2016-06-22 Import many flat files into Sqlite3 databaseΒΆ

This is a simple script to import big files into a sqlite3 database, too big to fit in memory. It relies on function import_flatfile_into_database.

# path to your data
datapath = "<somewhere>"

from pyensae.sql import import_flatfile_into_database, Database
import os

# get the list of files
csv = [_ for _ in os.listdir(datapath) if ".csv" in _]

# name of the database
file_db = os.path.join(datapath, "datanase.db3")

# retrieve the list of tables in the database
db = Database(file_db)
db.connect()
tables = db.get_table_list()
db.close()

# correct table name
def table_name(s):
    return os.path.split(s)[-1].split(".")[0].replace("-", "_")

# loop on csv files and import each of them if the
# corresponding table is not already here
# current speed: around 15min/Gb
for f in csv:
    print("import", f)
    name = table_name(f)
    if name not in tables:
        import_flatfile_into_database(file_db, os.path.join(datapath, f),table_name(f))

The other option is to use pandas.to_sql. It is implemented in function import_flatfile_into_database_pandas. It would look like this:

import os
import sqlite3
import pandas
from pyensae.sql import Database

datapath = "<somepath>"
csv = [_ for _ in os.listdir(datapath) if ".csv" in _]

file_db = os.path.join(datapath, "database.db3")

db = Database(file_db)
db.connect()
tables = db.get_table_list()
db.close()

def table_name(s):
    return os.path.split(s)[-1].split(".")[0].replace("-", "_")

with sqlite3.connect(file_db) as con:

    for f in csv:
        print("import", f)
        name = table_name(f)
        if name not in tables:

            params = {'filepath_or_buffer': os.path.join(datapath, f),
                      'encoding': "utf-8", 'sep':"," ,
                      'iterator': True, 'chunksize':1000000}
            nb = 0
            for part in pandas.read_csv(**params):
                nb += part.shape[0]
                print("number of added lines", nb)
                part.to_sql(con=con, name=name, if_exists="append", index=False)

            con.commit()