-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdump_file_meta_json.py
More file actions
137 lines (106 loc) · 3.47 KB
/
dump_file_meta_json.py
File metadata and controls
137 lines (106 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import json
import re
import urllib.parse
import semver
def try_semver(version):
    """Parse *version* with ``semver``, padding missing components if needed.

    The string is tried as given, then with ``.0`` appended, then with
    ``.0.0`` appended; the first successful parse wins.

    Args:
        version: Candidate version string.

    Returns:
        The parsed ``semver.Version``, or ``None`` when every attempt
        raises ``ValueError``.
    """
    parse = semver.Version.parse

    # First attempt: the string exactly as supplied.
    try:
        return parse(version)
    except ValueError:
        pass

    # Fallbacks: pad with the components that may have been left out.
    for padding in ('.0', '.0.0'):
        try:
            return parse(version + padding)
        except ValueError:
            continue
    return None
def try_parse_version(version):
    """Best-effort conversion of a free-form version string to a semver value.

    The text is normalised in stages before being handed to ``try_semver``:

    1. Surrounding whitespace and any leading non-numeric text are dropped.
    2. ``_``, ``-`` and ``,`` are treated as ``.`` separators.
    3. A trailing non-numeric tail becomes the pre-release part.
    4. Leading zeros inside components are removed and empty components
       are filled with ``0``.
    5. The result is cut or padded to exactly three components; any extra
       components are moved into the pre-release part.

    Args:
        version: Raw version text.

    Returns:
        Whatever ``try_semver`` yields for the normalised string, or
        ``None`` when the input is blank or cannot be parsed.
    """
    base = version.strip()
    if not base:
        return None

    # Pre-release suffix; stays empty unless a later stage finds one.
    pre = ''

    # Drop everything before the first character that can start a version
    # (digit, '-', '.', '_'). This also removes prefixes such as "v" or
    # "version", so they need no dedicated handling afterwards.
    m = re.match(r'(^[^-0-9._]*) *(.*)', base)
    if m is not None:
        base = m[2].strip()

    # Treat '_', '-' and ',' as component separators.
    base = base.replace('_', '.').replace('-', '.').replace(',', '.').strip()

    # Split a trailing non-numeric tail (e.g. "beta") off as the pre-release.
    # After the separator rewrite above the numeric head holds only digits
    # and dots and the tail holds no '-', so neither needs extra scrubbing.
    m = re.match(r'([-0-9.]+)([^0-9.]+.*)', base)
    if m is not None:
        base = m[1]
        pre = '-' + m[2].strip()

    # A leading dot means the first component is missing: ".5" -> "0.5".
    if base.startswith('.'):
        base = '0' + base

    # Repeatedly rewrite ".0" as ".". This strips leading zeros from a
    # component ("1.05" -> "1.5") and leaves all-zero components empty
    # ("1.0.2" -> "1..2"); the empties are refilled just below.
    while m := re.match(r'(.*)\.0(.*)', base):
        base = '.'.join(m.groups())

    # Trim stray dots at either end, then put a zero into every empty
    # component. One replace pass is not enough: str.replace skips
    # overlapping matches ("1...1" -> "1.0..1"), which used to leave an
    # empty component behind and made inputs like "1.0.0.1" unparseable.
    base = base.strip('.')
    while '..' in base:
        base = base.replace('..', '.0.')

    # Force exactly three numeric components.
    dots = base.count('.')
    if dots >= 3:
        parts = base.split('.')
        # NOTE(review): this overwrites any pre-release text found above
        # ("1.2.3.4beta" keeps "4" and loses "beta") - confirm intended.
        pre = '-' + '_'.join(parts[3:])
        base = '.'.join(parts[:3])
    elif dots == 1:
        base += '.0'
    elif dots == 0:
        base += '.0.0'

    if len(pre) >= 30:
        # Very long tails are discarded entirely.
        pre = ''
    elif pre:
        # Remove spaces, parentheses, '+' and '_' from the pre-release part.
        pre = pre.translate(str.maketrans('', '', ' ()+_'))

    # Final estimate: numeric core plus optional pre-release.
    parsed = try_semver(base + pre)
    return parsed if parsed else None
def run():
    """Convert the tab-separated ``data/file_meta`` listing into JSON.

    Every non-blank input line must hold four tab-separated fields: URL,
    size, SHA-256 and version. The last path segment of the URL is
    percent-encoded and the resulting URL becomes the key of the record.
    ``size`` and ``sha256`` are stored as the strings read from the file;
    ``version`` is stored stripped. When ``try_parse_version`` succeeds, a
    ``version_parsed`` entry with (major, minor, patch, prerelease, build)
    is added. The mapping is written to ``data/file_meta.json``.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields
            or its URL contains no ``/``.
    """
    mpath = 'data/file_meta'
    opath = 'data/file_meta.json'
    meta = {}
    with open(mpath, 'r') as f:
        for line in f:
            # A blank line carries no record; skip it instead of failing
            # in the four-way unpack below.
            if not line.strip():
                continue
            url, size, sha256, version = line.split('\t')
            url = url.strip()
            # Percent-encode only the file name, not the rest of the URL.
            rest, filename = url.rsplit('/', maxsplit=1)
            filename = urllib.parse.quote(filename)
            url = f"{rest}/{filename}"
            version = version.strip()
            v = try_parse_version(version)
            rec = {
                'size': size,
                'sha256': sha256,
                'version': version
            }
            if v is not None:
                rec['version_parsed'] = (
                    v.major,
                    v.minor,
                    v.patch,
                    v.prerelease,
                    v.build
                )
            meta[url] = rec
    with open(opath, 'w') as f:
        json.dump(meta, f, indent=2)
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    run()