misc-tools/data_conversion_util.py at master · CSNLab/misc-tools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#!/usr/bin/env python

#
# Author: Meng Du
# August 2020
#

"""
This script contains a few functions to convert data files between
json, csv and python dictionary/list, or wide and long format.
e.g. converting a json file to a python dictionary or a pickle file,
     flatten the dictionary to a list,
     converting data from wide format to long format,
     save the list as a csv file, etc.

Example 1: Convert a json file containing only one json object to a csv file
    # Read a json file to a dictionary
    # This might take a while, so if you need to load a same large json file multiple times
    # you can save it as a pickle file and then read from the pickle file instead.
    # Just omit the pkl_file parameter if you don't need that.
    data_dict = load_json('data.json', pkl_file='temp.pkl')
    # with open('temp.pkl', 'r') as infile:
    #     data_dict = pickle.load(infile)

    # Flatten the dictionary to a list
    # data_list is a list of tuples (col_names, values)
    data_list = [flatten(data_dict[sid], sid) for sid in data_dict]
    col_names, data = fill_missing_keys(data_list)

    # Write to csv (in wide format)
    list2csv(data, 'wide_data.csv', col_names)

    # Convert to long format while removing some unnecessary columns
    # See the docstring in cut_and_stack() for its usage
    long_cols, long_data = cut_and_stack(col_names, data, cut_start=64, cut_length=11, cut_number=40,
                                         skip_cols=range(3, 64))

    # Write to csv (in long format)
    list2csv(long_data, 'long_data.csv', long_cols)

Example 2: Convert a folder of json files, each containing multiple json objects, to a csv file
    # Read json files from DATA_FOLDER, and flatten each file
    # Take the file names as IDs while ignoring the last four characters (file extensions)
    data = [flatten(load_json(DATA_FOLDER + datafile, multiple_obj=True), obj_id=datafile[:-4])
            for datafile in os.listdir(DATA_FOLDER)]
    col_names, data = fill_missing_keys(data)

    # Write to csv (in wide format)
    list2csv(data, 'data.csv', col_names)

    # Convert to long format while removing some unnecessary columns
    skipping = range(1, 101) + range(2696, 2700)
    long_cols, long_data = cut_and_stack(col_names, data, cut_start=4, cut_length=9, cut_number=88, skipping)

    # Write to csv (in long format)
    list2csv(long_data, 'long_data.csv', long_cols)
"""


import pickle
import yaml
import csv


def load_json(json_file, multiple_obj=False, pkl_file=None):
    """
    Read a json file as a python object (or a list of python objects)
    :param json_file: string file name
    :param pkl_file: (string file name) optionally save the object as a pickle data file
    :param multiple_obj: (boolean) whether the json file contains multiple objects
                         (must have one object per line) or one object total
    :return: a python object (if multiple_obj is False) or
             a list of python objects (if multiple_obj is True)
    """
    with open(json_file, 'r') as infile:
        if multiple_obj:
            data = [yaml.safe_load(line) for line in infile]
        else:
            data = yaml.safe_load(infile.read())

    if pkl_file:
        with open(pkl_file, 'w') as outfile:
            pickle.dump(data, outfile)

    return data


def flatten(obj, obj_id=None):
    """
    Flatten a nested list or dictionary to a one-dimensional list.
    :param obj: a dictionary
    :param obj_id: if not None, a string 'id' will be added to the beginning of the returned list of names,
                   and this value will be added to the beginning of the returned list of values
    :return a list of string names, and a list of corresponding values
    """
    names, values = ([], []) if obj_id is None else (['id'], [obj_id])

    def _flatten(x, name=''):  # recursion
        if type(x) is dict:
            for k in x:
                _flatten(x[k], name + k + '.')
        elif type(x) is list:
            for i, a in enumerate(x):
                _flatten(a, name + str(i) + '.')
        else:
            if x is not None:
                names.append(name[:-1])
                values.append(x)

    _flatten(obj)
    names, values = zip(*sorted(zip(names, values)))
    names, values = list(names), list(values)
    return names, values


def fill_missing_keys(data_list):
    """
    Given a list of data with another list of column names to each row,
    find the union of column names, and make every row of data have an equal length
    by inserting empty strings at the missing columns.
    Assuming column names are unique and sorted.
    :param data_list: a list of tuples (column_name_list, value_list)
    :return a list of complete column names, and a 2-dimensional list of values
    """
    name_lists, data = zip(*data_list)

    # get a complete list of unique column names
    all_names = sorted(list(set(sum(name_lists, []))))

    # fill empty strings
    for row in range(len(name_lists)):
        name_i = 0  # name_list iterator
        data_i = 0  # data[row] iterator
        for i in range(len(all_names)):  # i: all_names iterator
            if name_i < len(name_lists[row]):
                if name_lists[row][name_i] != all_names[i]:
                    data[row].insert(data_i, '')
                else:
                    name_i += 1
                data_i += 1
            else:  # missing the last column
                data[row].append('')
    return all_names, data


def longest_common_substring(s1, s2):
    # adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python_2
    m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        for y in range(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest or (m[x][y] > 3 and x > x_longest):
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    lcs = s1[x_longest - longest: x_longest]
    for k, c in enumerate(lcs):
        if c.isalnum():
            return lcs[k:]  # start from the first alphanumeric character


def _simple_stack(wide_cols, wide_data, cols, data, start, end, skip_cols, cut_number):
    for col_i in range(start, end):
        if col_i not in skip_cols:
            cols.append(wide_cols[col_i])
            for row_i, row in enumerate(wide_data):
                for new_row_i in range(row_i * cut_number, (row_i + 1) * cut_number):
                    data[new_row_i].append(row[col_i])


def cut_and_stack(wide_cols, wide_data, cut_start, cut_length, cut_number, skip_cols=()):
    """
    Converts data from wide to long format.
    The cut range (cut_start, cut_start + cut_length * cut_number) has to be continuous
    after columns in skip_cols are excluded.
    :param wide_cols: (list) original data column names
    :param wide_data: (2D list) original data; each sublist should have the same length as wide_cols
    :param cut_start: (integer) the index indicating where the repetition start.
                      Columns and data will be cut and stacked right before this start index.
    :param cut_length: (integer) length of each repetition
    :param cut_number: (integer) number of repetitions
    :param skip_cols: (a list of integers) column indexes to be excluded.
    :return: a list of new column names and a list of new data
    """
    cols = []
    n_rows = len(wide_data) * cut_number
    data = [[] for i in range(n_rows)]
    skip_cols = list(skip_cols)
    if len(skip_cols) > 0:
        skip_cols.sort()
    # get cut_end and a list of skipped columns within the cut range
    cut_end = cut_start + cut_length * cut_number  # if no column skipped in the cut range
    skip_cols_in_cut = []
    for col in skip_cols:
        if cut_start <= col < cut_end:
            skip_cols_in_cut.append(col)
            cut_end += 1
    # columns before cut
    _simple_stack(wide_cols, wide_data, cols, data, 0, cut_start, skip_cols, cut_number)
    # cut starts
    second_cut_start = 0
    for row_i, row in enumerate(wide_data):
        skip_i = 0
        for cut in range(cut_number):
            for i in range(cut_length):
                data_row = row_i * cut_number + cut
                col = cut_start + cut * cut_length + i + skip_i
                while skip_i < len(skip_cols_in_cut) and col == skip_cols_in_cut[skip_i]:  # skip
                    skip_i += 1
                    col += 1
                if cut == 1 and i == 0:
                    second_cut_start = col
                if col < len(row):
                    data[data_row].append(row[col])
    # cut ends
    # get column names in cut range
    i, j, col_counter, skip_i, skip_j = cut_start, second_cut_start, 0, 0, 0
    while skip_j < len(skip_cols_in_cut) and skip_cols_in_cut[skip_j] < second_cut_start:
        skip_j += 1
    max_skip_i = int(skip_j)  # make copy
    while col_counter < cut_length:
        while skip_i < max_skip_i and i == skip_cols_in_cut[skip_i]:
            i += 1
            skip_i += 1
        while skip_j < len(skip_cols_in_cut) and j == skip_cols_in_cut[skip_j]:
            j += 1
            skip_j += 1
        name = longest_common_substring(wide_cols[i], wide_cols[j])  # combine first two column names in the cycle
        cols.append(name)
        col_counter += 1
        i += 1
        j += 1
    # columns after cut
    _simple_stack(wide_cols, wide_data, cols, data, cut_end, len(wide_cols), skip_cols, cut_number)
    return cols, data


def list2csv(data, csv_filename, col_names=None, delimiter=','):
    """
    Write a 2-dimensional data list to a csv file.
    """
    with open(csv_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=delimiter)
        if col_names:
            writer.writerow(col_names)
        for row in data:
            writer.writerow(row)