libtrackmap.py (forked from vecna/trackmap)

#!/usr/bin/python
import os, re, json, sys, random, time
import GeoIP
import tldextract
from tldextract import TLDExtract
from subprocess import Popen, PIPE
from termcolor import colored

def get_unique_urls(source_urldir, urldumpsf):
    # collect the unique hostnames found in a __urls request dump
    urls = {}
    with file(urldumpsf) as f:
        for url_request in f.readlines():
            if url_request.startswith('http://'):
                urls[url_request[7:].split('/')[0]] = True
            elif url_request.startswith('https://'):
                urls[url_request[8:].split('/')[0]] = True
            elif url_request.startswith('data:'):
                continue
            else:
                print "![ Unexpected link format!", url_request, "from", source_urldir, "]!"
                continue
    return urls.keys()
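
# A minimal usage sketch (hypothetical paths), assuming the capture directory
# contains a '__urls' dump with one requested URL per line:
#
#   hosts = get_unique_urls('output/www.example.com', 'output/www.example.com/__urls')
#   # e.g. ['www.example.com', 'cdn.example.net', ...] (order is not guaranteed)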

def sortify(outputdir):
    urldict = {}
    skipped = 0
    # the TLD extractor can be instantiated once and reused for every capture
    TLDio = TLDExtract(cache_file='mozilla_tld_file.dat')
    for urldir in os.listdir(outputdir):
        if urldir in ['phantom.log', '_traceroutes', 'unique_id', 'used_media_list',
                      '_verbotracelogs', 'domain.infos', 'country']:
            continue
        try:
            urlfile = os.path.join(outputdir, urldir, '__urls')
            related_urls = get_unique_urls(urldir, urlfile)
        except (IOError, OSError) as einfo:
            print "Unable to read", urldir, einfo, "skipping"
            continue
        for url in related_urls:
            if url in urldict:
                skipped += 1
                continue
            dnsplit = TLDio(url)
            urldict[url] = {
                'domain': dnsplit.domain,
                'tld': dnsplit.suffix,
                'subdomain': dnsplit.subdomain
            }
    # note:
    # https://raw.github.com/mozilla/gecko-dev/master/netwerk/dns/effective_tld_names.dat
    # tldextract is based on this file, and cloudfront.net is read as a TLD, but that is fine:
    # I only have to join domain + TLD to identify the "included entity",
    # just to know if the optimization is working well :)
    print "duplicated entries skipped:", skipped
    return urldict
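
# Per the note above, a minimal illustration of how the extractor splits a
# hostname (hypothetical hosts; exact results depend on the effective TLD list
# shipped in mozilla_tld_file.dat):
#
#   TLDio('news.example.co.uk')   # subdomain='news', domain='example', suffix='co.uk'
#   TLDio('d123.cloudfront.net')  # domain='d123', suffix='cloudfront.net' (treated as a TLD)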

def url_cleaner(line):
    # cleanurl is used to create the dir, media to phantomjs
    if line.startswith('http://'):
        cleanurl = line[7:]
    elif line.startswith('https://'):
        cleanurl = line[8:]
        print "https will be converted to http =>", line
    else:
        raise Exception("Invalid protocol in: %s" % line)
    # strip trailing slashes, then any query string and path component
    while cleanurl and cleanurl[-1] == '/':
        cleanurl = cleanurl[:-1]
    dirtyoptions = cleanurl.find("?")
    if dirtyoptions != -1:
        cleanurl = cleanurl[:dirtyoptions]
    cleanurl = cleanurl.split('/')[0]
    return cleanurl
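
# For example (hypothetical URL), url_cleaner('https://www.example.com/news?id=1')
# would print the https warning and return 'www.example.com'.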

def load_global_file(GLOBAL_MEDIA_FILE):
    global_media_dict = {}
    counter = 0
    with file(GLOBAL_MEDIA_FILE, 'r') as f:
        for line in f.readlines():
            line = line[:-1]
            if len(line) > 1 and line[0] == '#':
                continue
            # everything after a 0x20 (space) needs to be cut off
            line = line.split(' ')[0]
            if len(line) < 3:
                continue
            cleanurl = url_cleaner(line)
            counter += 1
            global_media_dict[cleanurl] = 'global'
    return global_media_dict, counter
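
# The global media list is expected to be a flat file (hypothetical content):
#
#   # one URL per line, '#' lines are comments,
#   # anything after the first space on a line is ignored
#   http://www.bbc.com
#   https://edition.cnn.com   international edition
#
# load_global_file() would map every host in it to the 'global' section.
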
GLOBAL_MEDIA_FILE = 'special_media/global'
PERMITTED_SECTIONS = [ 'global', 'national', 'local', 'blog' ]

def media_file_cleanings(linelist, globalfile=GLOBAL_MEDIA_FILE):
    """
    From the format

        [global]
        http://url
        # comment
        [othersec]
        http://otherweb

    return { 'url': 'global', 'otherweb': 'othersec' }
    """
    retdict = {}
    current_section = None
    counter_section = 0
    for line in linelist:
        line = line[:-1]
        if len(line) > 1 and line[0] == '#':
            continue
        # everything after a 0x20 (space) needs to be cut off
        line = line.split(' ')[0]
        if len(line) < 3:
            continue
        if line.startswith('[') and line.find(']') != -1:
            candidate_section = line[1:-1]
            if candidate_section not in PERMITTED_SECTIONS:
                print "The section in", line, "is invalid: it does not match", PERMITTED_SECTIONS
                quit(-1)
            # if we hit the 'global' section: it is special!
            if candidate_section == 'global':
                retdict, counter_section = load_global_file(globalfile)
                print "Global file loaded, with # entries", counter_section
                continue
            if current_section:
                print "Section", current_section, "has", counter_section, "entries"
            counter_section = 0
            current_section = candidate_section
            continue
        cleanurl = url_cleaner(line)
        if not current_section:
            print "detected URL", cleanurl, "without a section! (old file format?)"
            quit(-1)
        if cleanurl in retdict:
            print "Note:", cleanurl, "is duplicated"
        retdict[cleanurl] = current_section
        counter_section += 1
    # the last section is printed here
    if current_section:
        print "Section", current_section, "has", counter_section, "entries"
    return retdict
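
# A minimal usage sketch (hypothetical file name), assuming a per-country media
# list written in the sectioned format documented in the docstring above:
#
#   with file('special_media/example_country') as f:
#       media_map = media_file_cleanings(f.readlines())
#   # media_map would be e.g. { 'www.somenationalpaper.com': 'national', ... }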