#!/usr/bin/env julia using DataFrames using CSV function reservedwords() """Only the first set will be used at this stage Thanks to the Web2py code where I got the data in this format. """ POSTGRESQL = Set(( "FALSE", "TRUE", "ALL", "ANALYSE", "ANALYZE", "AND", "ANY", "ARRAY", "AS", "ASC", "ASYMMETRIC", "AUTHORIZATION", "BETWEEN", "BIGINT", "BINARY", "BIT", "BOOLEAN", "BOTH", "CASE", "CAST", "CHAR", "CHARACTER", "CHECK", "COALESCE", "COLLATE", "COLUMN", "CONSTRAINT", "CREATE", "CROSS", "CURRENT_CATALOG", "CURRENT_DATE", "CURRENT_ROLE", "CURRENT_SCHEMA", "CURRENT_TIME", "CURRENT_TIMESTAMP", "CURRENT_USER", "DEC", "DECIMAL", "DEFAULT", "DEFERRABLE", "DESC", "DISTINCT", "DO", "ELSE", "END", "EXCEPT", "EXISTS", "EXTRACT", "FETCH", "FLOAT", "FOR", "FOREIGN", "FREEZE", "FROM", "FULL", "GRANT", "GREATEST", "GROUP", "HAVING", "ILIKE", "IN", "INITIALLY", "INNER", "INOUT", "INT", "INTEGER", "INTERSECT", "INTERVAL", "INTO", "IS", "ISNULL", "JOIN", "LEADING", "LEAST", "LEFT", "LIKE", "LIMIT", "LOCALTIME", "LOCALTIMESTAMP", "NATIONAL", "NATURAL", "NCHAR", "NEW", "NONE", "NOT", "NOTNULL", "NULL", "NULLIF", "NUMERIC", "OFF", "OFFSET", "OLD", "ON", "ONLY", "OR", "ORDER", "OUT", "OUTER", "OVERLAPS", "OVERLAY", "PLACING", "POSITION", "PRECISION", "PRIMARY", "REAL", "REFERENCES", "RETURNING", "RIGHT", "ROW", "SELECT", "SESSION_USER", "SETOF", "SIMILAR", "SMALLINT", "SOME", "SUBSTRING", "SYMMETRIC", "TABLE", "THEN", "TIME", "TIMESTAMP", "TO", "TRAILING", "TREAT", "TRIM", "UNION", "UNIQUE", "USER", "USING", "VALUES", "VARCHAR", "VARIADIC", "VERBOSE", "WHEN", "WHERE", "WITH", "XMLATTRIBUTES", "XMLCONCAT", "XMLELEMENT", "XMLFOREST", "XMLPARSE", "XMLPI", "XMLROOT", "XMLSERIALIZE", )) POSTGRESQL_NONRESERVED = Set(( "A", "ABORT", "ABS", "ABSENT", "ABSOLUTE", "ACCESS", "ACCORDING", "ACTION", "ADA", "ADD", "ADMIN", "AFTER", "AGGREGATE", "ALIAS", "ALLOCATE", "ALSO", "ALTER", "ALWAYS", "ARE", "ARRAY_AGG", "ASENSITIVE", "ASSERTION", "ASSIGNMENT", "AT", "ATOMIC", "ATTRIBUTE", "ATTRIBUTES", "AVG", "BACKWARD", "BASE64", "BEFORE", "BEGIN", "BERNOULLI", "BIT_LENGTH", "BITVAR", "BLOB", "BOM", "BREADTH", "BY", "C", "CACHE", "CALL", "CALLED", "CARDINALITY", "CASCADE", "CASCADED", "CATALOG", "CATALOG_NAME", "CEIL", "CEILING", "CHAIN", "CHAR_LENGTH", "CHARACTER_LENGTH", "CHARACTER_SET_CATALOG", "CHARACTER_SET_NAME", "CHARACTER_SET_SCHEMA", "CHARACTERISTICS", "CHARACTERS", "CHECKED", "CHECKPOINT", "CLASS", "CLASS_ORIGIN", "CLOB", "CLOSE", "CLUSTER", "COBOL", "COLLATION", "COLLATION_CATALOG", "COLLATION_NAME", "COLLATION_SCHEMA", "COLLECT", "COLUMN_NAME", "COLUMNS", "COMMAND_FUNCTION", "COMMAND_FUNCTION_CODE", "COMMENT", "COMMIT", "COMMITTED", "COMPLETION", "CONCURRENTLY", "CONDITION", "CONDITION_NUMBER", "CONFIGURATION", "CONNECT", "CONNECTION", "CONNECTION_NAME", "CONSTRAINT_CATALOG", "CONSTRAINT_NAME", "CONSTRAINT_SCHEMA", "CONSTRAINTS", "CONSTRUCTOR", "CONTAINS", "CONTENT", "CONTINUE", "CONVERSION", "CONVERT", "COPY", "CORR", "CORRESPONDING", "COST", "COUNT", "COVAR_POP", "COVAR_SAMP", "CREATEDB", "CREATEROLE", "CREATEUSER", "CSV", "CUBE", "CUME_DIST", "CURRENT", "CURRENT_DEFAULT_TRANSFORM_GROUP", "CURRENT_PATH", "CURRENT_TRANSFORM_GROUP_FOR_TYPE", "CURSOR", "CURSOR_NAME", "CYCLE", "DATA", "DATABASE", "DATE", "DATETIME_INTERVAL_CODE", "DATETIME_INTERVAL_PRECISION", "DAY", "DEALLOCATE", "DECLARE", "DEFAULTS", "DEFERRED", "DEFINED", "DEFINER", "DEGREE", "DELETE", "DELIMITER", "DELIMITERS", "DENSE_RANK", "DEPTH", "DEREF", "DERIVED", "DESCRIBE", "DESCRIPTOR", "DESTROY", "DESTRUCTOR", "DETERMINISTIC", "DIAGNOSTICS", "DICTIONARY", "DISABLE", "DISCARD", "DISCONNECT", "DISPATCH", "DOCUMENT", "DOMAIN", "DOUBLE", "DROP", "DYNAMIC", "DYNAMIC_FUNCTION", "DYNAMIC_FUNCTION_CODE", "EACH", "ELEMENT", "EMPTY", "ENABLE", "ENCODING", "ENCRYPTED", "END-EXEC", "ENUM", "EQUALS", "ESCAPE", "EVERY", "EXCEPTION", "EXCLUDE", "EXCLUDING", "EXCLUSIVE", "EXEC", "EXECUTE", "EXISTING", "EXP", "EXPLAIN", "EXTERNAL", "FAMILY", "FILTER", "FINAL", "FIRST", "FIRST_VALUE", "FLAG", "FLOOR", "FOLLOWING", "FORCE", "FORTRAN", "FORWARD", "FOUND", "FREE", "FUNCTION", "FUSION", "G", "GENERAL", "GENERATED", "GET", "GLOBAL", "GO", "GOTO", "GRANTED", "GROUPING", "HANDLER", "HEADER", "HEX", "HIERARCHY", "HOLD", "HOST", "HOUR", # "ID", "IDENTITY", "IF", "IGNORE", "IMMEDIATE", "IMMUTABLE", "IMPLEMENTATION", "IMPLICIT", "INCLUDING", "INCREMENT", "INDENT", "INDEX", "INDEXES", "INDICATOR", "INFIX", "INHERIT", "INHERITS", "INITIALIZE", "INPUT", "INSENSITIVE", "INSERT", "INSTANCE", "INSTANTIABLE", "INSTEAD", "INTERSECTION", "INVOKER", "ISOLATION", "ITERATE", "K", "KEY", "KEY_MEMBER", "KEY_TYPE", "LAG", "LANCOMPILER", "LANGUAGE", "LARGE", "LAST", "LAST_VALUE", "LATERAL", "LC_COLLATE", "LC_CTYPE", "LEAD", "LENGTH", "LESS", "LEVEL", "LIKE_REGEX", "LISTEN", "LN", "LOAD", "LOCAL", "LOCATION", "LOCATOR", "LOCK", "LOGIN", "LOWER", "M", "MAP", "MAPPING", "MATCH", "MATCHED", "MAX", "MAX_CARDINALITY", "MAXVALUE", "MEMBER", "MERGE", "MESSAGE_LENGTH", "MESSAGE_OCTET_LENGTH", "MESSAGE_TEXT", "METHOD", "MIN", "MINUTE", "MINVALUE", "MOD", "MODE", "MODIFIES", "MODIFY", "MODULE", "MONTH", "MORE", "MOVE", "MULTISET", "MUMPS", # "NAME", "NAMES", "NAMESPACE", "NCLOB", "NESTING", "NEXT", "NFC", "NFD", "NFKC", "NFKD", "NIL", "NO", "NOCREATEDB", "NOCREATEROLE", "NOCREATEUSER", "NOINHERIT", "NOLOGIN", "NORMALIZE", "NORMALIZED", "NOSUPERUSER", "NOTHING", "NOTIFY", "NOWAIT", "NTH_VALUE", "NTILE", "NULLABLE", "NULLS", "NUMBER", "OBJECT", "OCCURRENCES_REGEX", "OCTET_LENGTH", "OCTETS", "OF", "OIDS", "OPEN", "OPERATION", "OPERATOR", "OPTION", "OPTIONS", "ORDERING", "ORDINALITY", "OTHERS", "OUTPUT", "OVER", "OVERRIDING", "OWNED", "OWNER", "P", "PAD", "PARAMETER", "PARAMETER_MODE", "PARAMETER_NAME", "PARAMETER_ORDINAL_POSITION", "PARAMETER_SPECIFIC_CATALOG", "PARAMETER_SPECIFIC_NAME", "PARAMETER_SPECIFIC_SCHEMA", "PARAMETERS", "PARSER", "PARTIAL", "PARTITION", "PASCAL", "PASSING", # "PASSWORD", "PATH", "PERCENT_RANK", "PERCENTILE_CONT", "PERCENTILE_DISC", "PLANS", "PLI", "POSITION_REGEX", "POSTFIX", "POWER", "PRECEDING", "PREFIX", "PREORDER", "PREPARE", "PREPARED", "PRESERVE", "PRIOR", "PRIVILEGES", "PROCEDURAL", "PROCEDURE", "PUBLIC", "QUOTE", "RANGE", "RANK", "READ", "READS", "REASSIGN", "RECHECK", "RECURSIVE", "REF", "REFERENCING", "REGR_AVGX", "REGR_AVGY", "REGR_COUNT", "REGR_INTERCEPT", "REGR_R2", "REGR_SLOPE", "REGR_SXX", "REGR_SXY", "REGR_SYY", "REINDEX", "RELATIVE", "RELEASE", "RENAME", "REPEATABLE", "REPLACE", "REPLICA", "RESET", "RESPECT", "RESTART", "RESTRICT", "RESULT", "RETURN", "RETURNED_CARDINALITY", "RETURNED_LENGTH", "RETURNED_OCTET_LENGTH", "RETURNED_SQLSTATE", "RETURNS", "REVOKE", "ROLE", "ROLLBACK", "ROLLUP", "ROUTINE", "ROUTINE_CATALOG", "ROUTINE_NAME", "ROUTINE_SCHEMA", "ROW_COUNT", "ROW_NUMBER", "ROWS", "RULE", "SAVEPOINT", "SCALE", "SCHEMA", "SCHEMA_NAME", "SCOPE", "SCOPE_CATALOG", "SCOPE_NAME", "SCOPE_SCHEMA", "SCROLL", "SEARCH", "SECOND", "SECTION", "SECURITY", "SELF", "SENSITIVE", "SEQUENCE", "SERIALIZABLE", "SERVER", "SERVER_NAME", "SESSION", "SET", "SETS", "SHARE", "SHOW", "SIMPLE", "SIZE", "SOURCE", "SPACE", "SPECIFIC", "SPECIFIC_NAME", "SPECIFICTYPE", "SQL", "SQLCODE", "SQLERROR", "SQLEXCEPTION", "SQLSTATE", "SQLWARNING", "SQRT", "STABLE", "STANDALONE", "START", "STATE", "STATEMENT", "STATIC", "STATISTICS", "STDDEV_POP", "STDDEV_SAMP", "STDIN", "STDOUT", "STORAGE", "STRICT", "STRIP", "STRUCTURE", "STYLE", "SUBCLASS_ORIGIN", "SUBLIST", "SUBMULTISET", "SUBSTRING_REGEX", "SUM", "SUPERUSER", "SYSID", "SYSTEM", "SYSTEM_USER", "T", "TABLE_NAME", "TABLESAMPLE", "TABLESPACE", "TEMP", "TEMPLATE", "TEMPORARY", "TERMINATE", "TEXT", "THAN", "TIES", "TIMEZONE_HOUR", "TIMEZONE_MINUTE", "TOP_LEVEL_COUNT", "TRANSACTION", "TRANSACTION_ACTIVE", "TRANSACTIONS_COMMITTED", "TRANSACTIONS_ROLLED_BACK", "TRANSFORM", "TRANSFORMS", "TRANSLATE", "TRANSLATE_REGEX", "TRANSLATION", "TRIGGER", "TRIGGER_CATALOG", "TRIGGER_NAME", "TRIGGER_SCHEMA", "TRIM_ARRAY", "TRUNCATE", "TRUSTED", "TYPE", "UESCAPE", "UNBOUNDED", "UNCOMMITTED", "UNDER", "UNENCRYPTED", "UNKNOWN", "UNLISTEN", "UNNAMED", "UNNEST", "UNTIL", "UNTYPED", "UPDATE", "UPPER", "URI", "USAGE", "USER_DEFINED_TYPE_CATALOG", "USER_DEFINED_TYPE_CODE", "USER_DEFINED_TYPE_NAME", "USER_DEFINED_TYPE_SCHEMA", "VACUUM", "VALID", "VALIDATOR", "VALUE", "VAR_POP", "VAR_SAMP", "VARBINARY", "VARIABLE", "VARYING", "VERSION", "VIEW", "VOLATILE", "WHENEVER", "WHITESPACE", "WIDTH_BUCKET", "WINDOW", "WITHIN", "WITHOUT", "WORK", "WRAPPER", "WRITE", "XML", "XMLAGG", "XMLBINARY", "XMLCAST", "XMLCOMMENT", "XMLDECLARATION", "XMLDOCUMENT", "XMLEXISTS", "XMLITERATE", "XMLNAMESPACES", "XMLQUERY", "XMLSCHEMA", "XMLTABLE", "XMLTEXT", "XMLVALIDATE", "YEAR", "YES", "ZONE", )) combined = union(POSTGRESQL, POSTGRESQL_NONRESERVED) POSTGRESQL end function field_type(df) """Get the names and field types from the dataframe""" nr,nc = size(df) fields = names(df) fl = [(string(fields[i]), string(eltype(df[i][1:nr]))) for i in 1:nc] end function jt_to_sql(t) """Convert the Julia-datatypes to Postgresql""" md = Dict( "UTF8String"=>"text", "Int64"=>"bigint", "Int8"=>"smallint", "UInt8"=>"smallint", "Int16"=>"integer", "UInt16"=>"integer", "Int32"=>"integer", "UInt32"=>"integer", "UInt64"=>"bigint", "Int128"=>"integer", "UInt128"=>"integer", "Float64"=>"numeric", "Char"=>"char(1)", "Bool"=>"boolean") if haskey(md,t) md[t] else "text" end end function correct_fieldvalue(v) """ replace xml-values with utf-8 replace ' with '' """ v = replace(v, r"'", "''") v = replace(v, r">", ">") v = replace(v, r"<", "<") end function row_of_strings(r) """Prepare a row of strings to print from one row""" nrows,ncols = size(r) s = fill("",ncols) for col in 1:ncols v = r[nrows,col] if ismissing(v) v = "\\N" else v = string(v) end s[col] = correct_fieldvalue(v) end join(s,"\t") end function correct_fieldname(f,pg_keywords) """ correct fieldnames that starts with either a numerical or a underscore character and add a 'n_' in front of the name """ syfers = r"^[0-9_]" if uppercase(f) in pg_keywords f = "n_$f" elseif ismatch(syfers,f) f = "n_$f" else f end f end function create_table_def(df,t) nrows,ncols = size(df) #rows and columns fls = field_type(df) # get the fieldnames and types pg_keywords = reservedwords() # Create a list of fieldnames fns = [correct_fieldname(x[1],pg_keywords) for x in fls] s = "CREATE TABLE $t (" for c in 1:length(fls) (i,f) = fls[c] #i = fieldname f = type if i == "id" st = "serial" else st = jt_to_sql(f) # postgresql-type end i = correct_fieldname(i,pg_keywords) s = s * "$i $st,\n" end ss = s[1:length(s)-2] * ");\n\n" # remove last ',' and '\n' and add ');' # The following header was taken from a file created by pg_dump header = """SET statement_timeout = 0; SET lock_timeout = 0; SET idle_in_transaction_session_timeout = 0; SET client_encoding = 'UTF8'; SET standard_conforming_strings = on; SET check_function_bodies = false; SET client_min_messages = warning; SET row_security = off; SET search_path = public, pg_catalog; SET default_tablespace = ''; SET default_with_oids = false;\n\n""" print(header) print(ss) # Create a line like this: # COPY aa (article, sa_author, id, last_name, first_name, aa) FROM stdin; cs = "COPY $t (" z = join(fns, ", ") css = cs * z * ") FROM stdin;\n" print(css) format_rows(df) end function format_rows(df) """Create rows of tab-delimited data from the values in the dataframe""" nrows,ncols = size(df) for row in 1:nrows s = row_of_strings(df[row,1:ncols]) println(s) end println("\\.") end function addcolumn(df, c, v ) # Add a column with the name c and value v(in all the rows) to df df[c] = v df end function add_id(df) # if there is a field "id" do nothing else add it nrows, = size(df) (:id in names(df)) || addcolumn(df, :id, 1:nrows) end if length(ARGS) < 2 println println("-----------------------------------------------------------------------------") println("Usage: julia csv_to_sql outputfile.sql] ") println("If you do not redirect it the output will be to stdout") println("-----------------------------------------------------------------------------") println exit(1) end #function __init__() input_file = ARGS[1] table = ARGS[2] df = CSV.read(input_file) df = add_id(df) # Add id-field if it is not present in dataframe create_table_def(df,table) #end