-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest_crawler.py
More file actions
60 lines (42 loc) · 1.58 KB
/
test_crawler.py
File metadata and controls
60 lines (42 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pytest
import requests
import crawler
@pytest.fixture
def getter(request):
    """Provide a fake ``get(url)`` callable that serves an endless chain of pages.

    Every call to the returned callable produces a new ``requests.Response``
    whose body is a minimal HTML document containing exactly one link. The
    link's path is a counter that starts at 1 and grows by one per call, so
    the N-th page served links to ``/N``. The ``url`` argument is ignored.

    When the string form of the requesting test function contains
    ``relative_url`` the links carry no host (``/1``, ``/2``, ...); otherwise
    they are absolute links on ``https://www.touchsurgery.com``.
    """
    # The requesting test does not change while the fixture is alive, so the
    # link prefix is decided once rather than for every generated page.
    if "relative_url" in str(request.function):
        host = ""
    else:
        host = "https://www.touchsurgery.com"

    def internet_forever():
        """Yield one single-link response per step, without ever finishing."""
        counter = 1
        while True:
            response = requests.Response()
            page = f'<html><a href="{host}/{counter}"></a></html>'
            # The body is injected through the private ``_content`` attribute;
            # no other field of the response (status, headers, url) is set.
            response._content = bytes(page, "utf8")
            counter += 1
            yield response

    # One generator per fixture invocation, kept in the closure so that
    # successive mock_get calls continue the same page sequence.
    pages = internet_forever()

    def mock_get(url):
        """Return the next generated page, regardless of ``url``."""
        return next(pages)

    return mock_get
def test_crawler_visits_site_and_discovers_links(getter):
    """After ``start(iterations=0)`` the link from the first served page is queued."""
    start_url = "https://www.example.com"
    # The fake getter's first page links to <host>/1.
    expected_queue = ["https://www.touchsurgery.com/1"]

    bot = crawler.Crawler(start_url, getter)
    bot.start(iterations=0)

    assert bot.links_to_visit == expected_queue
def test_crawler_recurses_into_discovered_links(getter):
    """After ``start(iterations=2)`` the start URL and two discovered links are visited, in order."""
    start_url = "https://www.example.com"
    # The fake getter serves pages linking to /1, /2, ... on the touchsurgery
    # host, so each visit reveals exactly one new absolute link.
    expected_visits = [
        "https://www.example.com",
        "https://www.touchsurgery.com/1",
        "https://www.touchsurgery.com/2",
    ]

    bot = crawler.Crawler(start_url, getter)
    bot.start(iterations=2)

    assert bot.visited_links == expected_visits
def test_crawler_handles_discovering_relative_urls(getter, request):
    """Host-less links found while crawling are turned into full URLs and visited.

    The ``getter`` fixture serves host-less hrefs (``/1``, ``/2``, ...) for
    this test because the test name contains ``relative_url``.
    """
    start_url = "https://www.example.com"
    # NOTE(review): the served hrefs start with "/", so standard reference
    # resolution against "https://www.example.com/1" would give
    # "https://www.example.com/2". The last expected entry instead appends the
    # href to the current page URL - confirm this is the intended behaviour.
    expected_visits = [
        "https://www.example.com",
        "https://www.example.com/1",
        "https://www.example.com/1/2",
    ]

    bot = crawler.Crawler(start_url, getter)
    bot.start(iterations=2)

    assert bot.visited_links == expected_visits