-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimportDeltas.py
More file actions
118 lines (111 loc) · 4.12 KB
/
Copy pathimportDeltas.py
File metadata and controls
118 lines (111 loc) · 4.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import re, json, pymongo
import gzip, sys, os, shutil, pickle
def parse(text, name):
    """Parse one repository's delta dump into a document of commits.

    The dump is a flat list of ';'-separated records, one per changed file.
    Consecutive records that share a revision id are folded into a single
    commit whose 'files' list has one entry per record.  Two layouts are
    recognised:

    * Mercurial -- records are separated by an 'ENDOFCOMMENT' marker and hold
      revision;parent;branch;file;date;author_login;author;comment
    * Git -- one record per line holding
      revision;author;committer;author_login;committer_login;
      added:deleted;author_time;committer_time;file;comment

    The comment is the tail of the record and may itself contain ';'.

    text -- raw contents of the delta file; '' yields no commits.
    name -- repository name, stored (escaped) under the 'name' key.

    Returns a (data, size) tuple: data is {'name': ..., 'commits': [...]}
    and size is the length of the parsed text (0 for empty input).

    Raises IndexError when a record has fewer fields than its layout needs.
    """
    data = {'name': decode(name), 'commits': []}
    if text == '':
        return (data, 0)
    # str.replace returns a new string, so the result must be kept.
    text = text.replace('\r', '')
    # Mercurial dumps carry the ENDOFCOMMENT marker and a ':' inside the
    # first field; everything else is treated as Git.
    first_field = text[:text.find(';')]
    if text.find('ENDOFCOMMENT') > 0 and first_field.find(':') > 0:
        data['commits'] = _parse_hg_commits(text)
    else:
        data['commits'] = _parse_git_commits(text)
    return data, len(text)


def _parse_hg_commits(text):
    """Group ENDOFCOMMENT-separated Mercurial records into commit dicts."""
    marker = 'ENDOFCOMMENT'
    commits = []
    commit = None
    last_revision = None
    for change in text.strip().split(marker + '\n'):
        # strip() removed the newline after the final marker, so the last
        # record still ends with the marker itself.
        if change.endswith(marker):
            change = change[:-len(marker)]
        if not change.strip():
            continue
        splits = change.split(';')
        # There may be semicolons in the comment too, so only the first
        # seven fields are positional.
        props = splits[:7]
        comment = ';'.join(splits[7:])
        revision = props[0]
        file_entry = {'name': decode(props[3])}
        if commit is None or revision != last_revision:
            commit = {
                'revision': revision,
                'vcs': 'hg',
                'parent': props[1],
                'branch': props[2],
                'date': props[4],
                'author_login': decode(props[5]),
                'author': decode(props[6]),
                'comment': decode(comment.strip()),
                'files': [file_entry],
            }
            # Register the commit as soon as it is created so that neither an
            # empty placeholder is stored nor the final commit is lost.
            commits.append(commit)
            last_revision = revision
        else:
            # Same revision as the previous record: just one more file.
            commit['files'].append(file_entry)
    return commits


def _parse_git_commits(text):
    """Group line-per-file Git records into commit dicts."""
    commits = []
    commit = None
    last_revision = None
    for line in text.splitlines():
        if not line.strip():
            continue
        props = line.split(';')
        revision = props[0]
        locs = props[5].split(':')
        file_entry = {
            'name': decode(props[8]),
            'loc_added': locs[0],
            'loc_deleted': locs[1],
        }
        if commit is None or revision != last_revision:
            commit = {
                'revision': revision,
                'vcs': 'git',
                'author': decode(props[1]),
                'committer': decode(props[2]),
                'author_login': decode(props[3]),
                'committer_login': decode(props[4]),
                'author_time': props[6],
                'committer_time': props[7],
                'comment': decode(';'.join(props[9:]).strip()),
                'files': [file_entry],
            }
            # Register the commit as soon as it is created so that neither an
            # empty placeholder is stored nor the final commit is lost.
            commits.append(commit)
            last_revision = revision
        else:
            # Same revision as the previous record: just one more file.
            commit['files'].append(file_entry)
    return commits
def chunks(l, n):
    """Cut l into consecutive slices of at most n items each.

    A step below 1 is treated as 1.  The last slice may be shorter than the
    others; an empty input gives an empty list.
    """
    step = 1 if n < 1 else n
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces
def decode(text):
    """Return str(text) with special characters backslash-escaped.

    Backslashes, single quotes, tab/newline/carriage return and any
    non-printable or non-ASCII character come back as escape sequences, so
    the result is plain ASCII.

    text -- any value; it is converted with str() first.
    """
    # On Python 3, str() of a bytes object would yield "b'...'"; map each
    # byte to the code point of the same value instead.  On Python 2 bytes
    # is str, so this branch is never taken.
    if isinstance(text, bytes) and not isinstance(text, str):
        text = text.decode('latin-1')
    raw = str(text)
    try:
        return raw.encode('string_escape')
    except LookupError:
        # The 'string_escape' codec only exists on Python 2.  'unicode_escape'
        # yields the same escapes for code points up to 0xFF except that it
        # leaves single quotes alone, so those are escaped by hand.
        escaped = raw.encode('unicode_escape').decode('ascii')
        return escaped.replace("'", "\\'")
if __name__ == '__main__':
    # Bulk-load every delta file listed in 'deltas.todo' (one file name per
    # line, relative to delta_dir) into the 'deltas' collection of the
    # 'bitbucket' database.  A repository is stored as a single document, or
    # as several documents tagged with a 'chunk' index once its size estimate
    # reaches the limit below.
    delta_dir = 'delta/'
    # One third of 16777216 bytes (16 MiB).
    # NOTE(review): presumably headroom under MongoDB's document size limit
    # -- confirm.
    size_limit = 16777216 // 3
    client = pymongo.MongoClient(host="da0.eecs.utk.edu")
    db = client['bitbucket']
    deltas = db['deltas']
    # Get list of files
    with open('deltas.todo', 'r') as todo:
        filenames = todo.readlines()
    counter = 0
    for filename in filenames:
        filename = filename.strip()
        if not filename:
            # A blank line would otherwise try to open the directory itself.
            continue
        # Progress is written as "<filename> <counter>" or "<filename> error".
        # Plain write() calls give the same output as the print statements
        # they replace and are valid under either print syntax.
        sys.stdout.write(filename + ' ')
        with gzip.open(delta_dir + filename) as archive:
            contents = archive.read()
        if not isinstance(contents, str):
            # gzip returns bytes on Python 3, while parse() works on text.
            contents = contents.decode('latin-1')
        repo_name = filename.replace('.delta.gz', '').replace('bitbucket.org_', '')
        delta, size = parse(contents, repo_name)
        # sys.getsizeof is shallow: this adds only the footprint of the
        # top-level dict on top of the text length reported by parse().
        size += sys.getsizeof(delta)
        try:
            if size < size_limit:
                deltas.insert(delta)
            else:
                # NOTE(review): n is derived like a count of limit-sized
                # pieces, but chunks() uses it as the number of commits per
                # chunk -- confirm which was intended before changing it.
                n = 3 * size // 16777216
                for i, ch in enumerate(chunks(delta['commits'], n)):
                    deltas.insert({'name': delta['name'], 'commits': ch, 'chunk': i})
            sys.stdout.write('%d\n' % counter)
            sys.stdout.flush()
            counter += 1
        except Exception as e:
            # Best effort: report the failure with its cause, then move on to
            # the next file.
            sys.stdout.write('error\n')
            sys.stderr.write('%s could not store: %s\n' % (filename, e))