-
Notifications
You must be signed in to change notification settings - Fork 34
Expand file tree
/
Copy pathiter-and-warc.py
More file actions
executable file
·36 lines (27 loc) · 948 Bytes
/
iter-and-warc.py
File metadata and controls
executable file
·36 lines (27 loc) · 948 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python
import cdx_toolkit
# Example script: iterate over index entries that match a URL pattern and
# write the fetched WARC record of every status-200 capture to a WARC file.
cdx = cdx_toolkit.CDXFetcher(source='cc')  # 'cc' presumably selects Common Crawl -- TODO confirm
url = 'commoncrawl.org/blog/*'

# Metadata passed to the WARC writer.
# NOTE(review): 'format' states version 1.0 while get_writer() below is
# called with warc_version='1.1' -- confirm which one is intended.
warcinfo = {
    'software': 'pypi_cdx_toolkit iter-and-warc example',
    'isPartOf': 'EXAMPLE-COMMONCRAWL',
    'description': 'warc extraction',
    'format': 'WARC file version 1.0',
}

writer = cdx_toolkit.warc.get_writer('EXAMPLE', 'COMMONCRAWL', warcinfo, warc_version='1.1')

# try/finally guarantees the writer is closed even if iteration, fetching
# or writing raises something other than the RuntimeError handled below.
try:
    for obj in cdx.iter(url, limit=10):
        # Use a distinct name so the query pattern in `url` is not clobbered.
        capture_url = obj['url']
        status = obj['status']
        timestamp = obj['timestamp']

        print('considering extracting url', capture_url, 'timestamp', timestamp)
        # The status is compared as a string, not an int.
        if status != '200':
            print(' skipping because status was {}, not 200'.format(status))
            continue

        try:
            record = obj.fetch_warc_record()
        except RuntimeError:
            # print() does not interpolate %-style templates, so format the
            # message explicitly instead of passing the values as extra args.
            print(' skipping capture for RuntimeError 404: {} {}'.format(capture_url, timestamp))
            continue
        writer.write_record(record)

        print(' wrote', capture_url)
finally:
    writer.close()