-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_conversion_util.py
More file actions
250 lines (217 loc) · 9.79 KB
/
data_conversion_util.py
File metadata and controls
250 lines (217 loc) · 9.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#!/usr/bin/env python
#
# Author: Meng Du
# August 2020
#
"""
This script contains a few functions to convert data files between
json, csv and python dictionary/list, or wide and long format.
e.g. converting a json file to a python dictionary or a pickle file,
flatten the dictionary to a list,
converting data from wide format to long format,
save the list as a csv file, etc.
Example 1: Convert a json file containing only one json object to a csv file
# Read a json file to a dictionary
# This might take a while, so if you need to load a same large json file multiple times
# you can save it as a pickle file and then read from the pickle file instead.
# Just omit the pkl_file parameter if you don't need that.
data_dict = load_json('data.json', pkl_file='temp.pkl')
# with open('temp.pkl', 'r') as infile:
# data_dict = pickle.load(infile)
# Flatten the dictionary to a list
# data_list is a list of tuples (col_names, values)
data_list = [flatten(data_dict[sid], sid) for sid in data_dict]
col_names, data = fill_missing_keys(data_list)
# Write to csv (in wide format)
list2csv(data, 'wide_data.csv', col_names)
# Convert to long format while removing some unnecessary columns
# See the docstring in cut_and_stack() for its usage
long_cols, long_data = cut_and_stack(col_names, data, cut_start=64, cut_length=11, cut_number=40,
skip_cols=range(3, 64))
# Write to csv (in long format)
list2csv(long_data, 'long_data.csv', long_cols)
Example 2: Convert a folder of json files, each containing multiple json objects, to a csv file
# Read json files from DATA_FOLDER, and flatten each file
# Take the file names as IDs while ignoring the last four characters (file extensions)
data = [flatten(load_json(DATA_FOLDER + datafile, multiple_obj=True), obj_id=datafile[:-4])
for datafile in os.listdir(DATA_FOLDER)]
col_names, data = fill_missing_keys(data)
# Write to csv (in wide format)
list2csv(data, 'data.csv', col_names)
# Convert to long format while removing some unnecessary columns
skipping = range(1, 101) + range(2696, 2700)
long_cols, long_data = cut_and_stack(col_names, data, cut_start=4, cut_length=9, cut_number=88, skipping)
# Write to csv (in long format)
list2csv(long_data, 'long_data.csv', long_cols)
"""
import pickle
import yaml
import csv
def load_json(json_file, multiple_obj=False, pkl_file=None):
"""
Read a json file as a python object (or a list of python objects)
:param json_file: string file name
:param pkl_file: (string file name) optionally save the object as a pickle data file
:param multiple_obj: (boolean) whether the json file contains multiple objects
(must have one object per line) or one object total
:return: a python object (if multiple_obj is False) or
a list of python objects (if multiple_obj is True)
"""
with open(json_file, 'r') as infile:
if multiple_obj:
data = [yaml.safe_load(line) for line in infile]
else:
data = yaml.safe_load(infile.read())
if pkl_file:
with open(pkl_file, 'w') as outfile:
pickle.dump(data, outfile)
return data
def flatten(obj, obj_id=None):
"""
Flatten a nested list or dictionary to a one-dimensional list.
:param obj: a dictionary
:param obj_id: if not None, a string 'id' will be added to the beginning of the returned list of names,
and this value will be added to the beginning of the returned list of values
:return a list of string names, and a list of corresponding values
"""
names, values = ([], []) if obj_id is None else (['id'], [obj_id])
def _flatten(x, name=''): # recursion
if type(x) is dict:
for k in x:
_flatten(x[k], name + k + '.')
elif type(x) is list:
for i, a in enumerate(x):
_flatten(a, name + str(i) + '.')
else:
if x is not None:
names.append(name[:-1])
values.append(x)
_flatten(obj)
names, values = zip(*sorted(zip(names, values)))
names, values = list(names), list(values)
return names, values
def fill_missing_keys(data_list):
"""
Given a list of data with another list of column names to each row,
find the union of column names, and make every row of data have an equal length
by inserting empty strings at the missing columns.
Assuming column names are unique and sorted.
:param data_list: a list of tuples (column_name_list, value_list)
:return a list of complete column names, and a 2-dimensional list of values
"""
name_lists, data = zip(*data_list)
# get a complete list of unique column names
all_names = sorted(list(set(sum(name_lists, []))))
# fill empty strings
for row in range(len(name_lists)):
name_i = 0 # name_list iterator
data_i = 0 # data[row] iterator
for i in range(len(all_names)): # i: all_names iterator
if name_i < len(name_lists[row]):
if name_lists[row][name_i] != all_names[i]:
data[row].insert(data_i, '')
else:
name_i += 1
data_i += 1
else: # missing the last column
data[row].append('')
return all_names, data
def longest_common_substring(s1, s2):
# adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python_2
m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
longest, x_longest = 0, 0
for x in range(1, 1 + len(s1)):
for y in range(1, 1 + len(s2)):
if s1[x - 1] == s2[y - 1]:
m[x][y] = m[x - 1][y - 1] + 1
if m[x][y] > longest or (m[x][y] > 3 and x > x_longest):
longest = m[x][y]
x_longest = x
else:
m[x][y] = 0
lcs = s1[x_longest - longest: x_longest]
for k, c in enumerate(lcs):
if c.isalnum():
return lcs[k:] # start from the first alphanumeric character
def _simple_stack(wide_cols, wide_data, cols, data, start, end, skip_cols, cut_number):
for col_i in range(start, end):
if col_i not in skip_cols:
cols.append(wide_cols[col_i])
for row_i, row in enumerate(wide_data):
for new_row_i in range(row_i * cut_number, (row_i + 1) * cut_number):
data[new_row_i].append(row[col_i])
def cut_and_stack(wide_cols, wide_data, cut_start, cut_length, cut_number, skip_cols=()):
"""
Converts data from wide to long format.
The cut range (cut_start, cut_start + cut_length * cut_number) has to be continuous
after columns in skip_cols are excluded.
:param wide_cols: (list) original data column names
:param wide_data: (2D list) original data; each sublist should have the same length as wide_cols
:param cut_start: (integer) the index indicating where the repetition start.
Columns and data will be cut and stacked right before this start index.
:param cut_length: (integer) length of each repetition
:param cut_number: (integer) number of repetitions
:param skip_cols: (a list of integers) column indexes to be excluded.
:return: a list of new column names and a list of new data
"""
cols = []
n_rows = len(wide_data) * cut_number
data = [[] for i in range(n_rows)]
skip_cols = list(skip_cols)
if len(skip_cols) > 0:
skip_cols.sort()
# get cut_end and a list of skipped columns within the cut range
cut_end = cut_start + cut_length * cut_number # if no column skipped in the cut range
skip_cols_in_cut = []
for col in skip_cols:
if cut_start <= col < cut_end:
skip_cols_in_cut.append(col)
cut_end += 1
# columns before cut
_simple_stack(wide_cols, wide_data, cols, data, 0, cut_start, skip_cols, cut_number)
# cut starts
second_cut_start = 0
for row_i, row in enumerate(wide_data):
skip_i = 0
for cut in range(cut_number):
for i in range(cut_length):
data_row = row_i * cut_number + cut
col = cut_start + cut * cut_length + i + skip_i
while skip_i < len(skip_cols_in_cut) and col == skip_cols_in_cut[skip_i]: # skip
skip_i += 1
col += 1
if cut == 1 and i == 0:
second_cut_start = col
if col < len(row):
data[data_row].append(row[col])
# cut ends
# get column names in cut range
i, j, col_counter, skip_i, skip_j = cut_start, second_cut_start, 0, 0, 0
while skip_j < len(skip_cols_in_cut) and skip_cols_in_cut[skip_j] < second_cut_start:
skip_j += 1
max_skip_i = int(skip_j) # make copy
while col_counter < cut_length:
while skip_i < max_skip_i and i == skip_cols_in_cut[skip_i]:
i += 1
skip_i += 1
while skip_j < len(skip_cols_in_cut) and j == skip_cols_in_cut[skip_j]:
j += 1
skip_j += 1
name = longest_common_substring(wide_cols[i], wide_cols[j]) # combine first two column names in the cycle
cols.append(name)
col_counter += 1
i += 1
j += 1
# columns after cut
_simple_stack(wide_cols, wide_data, cols, data, cut_end, len(wide_cols), skip_cols, cut_number)
return cols, data
def list2csv(data, csv_filename, col_names=None, delimiter=','):
"""
Write a 2-dimensional data list to a csv file.
"""
with open(csv_filename, 'w') as outfile:
writer = csv.writer(outfile, delimiter=delimiter)
if col_names:
writer.writerow(col_names)
for row in data:
writer.writerow(row)