usda-database-api/load_datasets.py at main · imivi/usda-database-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import sqlite3
from env import env

import pandas as pd


def get_csv_paths(folder_path: str) -> list[str]:
    csv_files = []

    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".csv"):
                csv_files.append(os.path.join(root, file))

    return csv_files


def initialize_db(db_conn: sqlite3.Connection):
    cursor = db_conn.cursor()

    create_tables_query = """
    CREATE TABLE IF NOT EXISTS food_category (
        id INTEGER PRIMARY KEY,
        code INTEGER,
        description TEXT
    );
    CREATE TABLE IF NOT EXISTS food (
        fdc_id INTEGER PRIMARY KEY,
        data_type TEXT,
        description TEXT,
        food_category_id INTEGER,
        publication_date TEXT
    );
    CREATE TABLE IF NOT EXISTS food_nutrient (
        id INTEGER PRIMARY KEY,
        fdc_id INTEGER,
        nutrient_id INTEGER,
        amount REAL,
        data_points INTEGER,
        derivation_id INTEGER,
        min TEXT,
        max TEXT,
        median TEXT,
        footnote TEXT,
        min_year_acquired TEXT
    );
    CREATE TABLE IF NOT EXISTS nutrient (
        id INTEGER PRIMARY KEY,
        name TEXT,
        unit_name TEXT,
        nutrient_nbr INTEGER,
        rank INTEGER
    );
    """

    cursor.executescript(create_tables_query)

    db_conn.commit()


def insert_rows_into_db(db_cursor: sqlite3.Cursor, table_name, csv_chunk):

    values = [tuple(row) for row in csv_chunk.values]
    placeholders = ", ".join(["?" for _ in csv_chunk.columns])
    insert_query = f"INSERT OR IGNORE INTO {table_name} VALUES ({placeholders})"

    db_cursor.executemany(insert_query, values)


def load_csv_into_table(db_conn: sqlite3.Connection, csv_path: str, table_name: str):

    chunk_size = 1000

    csv_chunks = pd.read_csv(
        csv_path,
        chunksize=chunk_size,
        header=0,  # Skip the header row
    )

    cursor = db_conn.cursor()

    for csv_chunk in csv_chunks:
        insert_rows_into_db(cursor, table_name, csv_chunk)

    db_conn.commit()


def get_csv_table(csv_path: str) -> str:
    name, _ = os.path.splitext(csv_path)
    for table_name in ["food", "food_nutrient", "nutrient", "food_category"]:
        if name.endswith(table_name):
            return table_name
    raise Exception("Invalid CSV name, cannot identify table:", csv_path)


def main():

    with sqlite3.connect("database/db.sqlite3") as db_conn:

        # Create tables if they don't exist yet
        if env.INIT_DB:
            initialize_db(db_conn)
            print('✅ Database initialized')

        # Read CSVs and insert rows into database
        if env.SEED_DB:
            for i, csv_path in enumerate(get_csv_paths("datasets")):
                table_name = get_csv_table(csv_path)
                print(i + 1, "CSV inserted into table:", csv_path, "-->", table_name)
                load_csv_into_table(db_conn, csv_path, table_name)
            print('✅ Database seeded')


if __name__ == "__main__":
    main()