Commit 84264a7

Addressed comments on the initial commit: created a test file in the ./src/tests directory and used the unittest suite to explore the conditions under which the function raises an exception and how it handles valid and invalid data. Moved cached_get into a separate file.
1 parent a3b373e commit 84264a7

8 files changed

Lines changed: 107 additions & 40 deletions


webscraper/src/Cheaper_Scraper.py

Lines changed: 12 additions & 21 deletions
@@ -1,15 +1,16 @@
 import requests
 import time
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 import logging
 from typing import Dict, List, Optional
 # I added these imports below because when I ran it, it wasn't finding the folders; remove them if you don't need them
 import sys
 import os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-
+from src.fetch_utils import cached_get
 from ABC.base_scraper import BaseScraper
-from robot_check import RoboCheck
+from src.robot_check import RoboCheck
 from functools import lru_cache
 
 
@@ -22,7 +23,10 @@ def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float
            user_agent: User agent string to identify the scraper
            delay: Time in seconds to wait between requests
        """
-
+        parsed_url = urlparse(base_url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            raise ValueError(f"Invalid base URL: {base_url}")
+
        self.base_url = base_url.rstrip('/')
        self.delay = delay
        self.user_agent = user_agent
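
The new guard relies on urllib.parse.urlparse, which typically does not raise on a malformed string; it just returns empty components, so checking scheme and netloc catches inputs like "not_a_real_url". A quick illustration (the variable names are illustrative, not part of this commit):

from urllib.parse import urlparse

ok = urlparse("https://books.toscrape.com")  # scheme='https', netloc='books.toscrape.com' -> passes the guard
bad = urlparse("not_a_real_url")             # scheme='', netloc='' -> __init__ raises ValueError
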
@@ -37,19 +41,7 @@ def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float
        self.robots = RoboCheck(base_url, user_agent)
 
 
-    @staticmethod
-    @lru_cache(maxsize=128)  # cache up to 128 unique URLs
-    def _cached_get(url: str, user_agent: str) -> Optional[str]:
-        print(f"[HTTP Request] Fetching from web: {url}")  # <== ADD THIS
-        headers = {"User-Agent": user_agent}
-        try:
-            response = requests.get(url, headers=headers, timeout=10)
-            response.raise_for_status()
-            return response.text
-        except requests.RequestException as e:
-            logging.error(f"Error fetching {url}: {e}")
-            return None
-
+
 
    def fetch(self, path: str = "/") -> Optional[str]:
        """Fetch content from a specific path.
@@ -65,13 +57,12 @@ def fetch(self, path: str = "/") -> Optional[str]:
            logging.warning(f"Disallowed by robots.txt: {path}")
            return None
 
-
        url = self.base_url + path
-        cached_before = self._cached_get.cache_info().hits
-        html = self._cached_get(url, self.user_agent)
-        cached_after = self._cached_get.cache_info().hits
+        cached_before = cached_get.cache_info().hits
+        html = cached_get(url, self.user_agent)
+        cached_after = cached_get.cache_info().hits
 
-        if cached_after == cached_before:  # No cache hit, so it was fetched
+        if cached_after == cached_before:
            time.sleep(self.delay)
 
        return html
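
The rate-limit logic in fetch() leans on a documented property of functools.lru_cache: the wrapper's cache_info().hits counter only increments when a call is answered from the cache. Comparing the counter before and after a call therefore reveals whether a real network request happened, so the politeness delay is skipped on cache hits. A minimal standalone sketch of the pattern (fetch_page and its fake body are illustrative, not part of this commit):

from functools import lru_cache
import time

@lru_cache(maxsize=128)
def fetch_page(url: str) -> str:
    # Stand-in for a real HTTP request.
    return f"<html>{url}</html>"

def polite_fetch(url: str, delay: float = 1.0) -> str:
    hits_before = fetch_page.cache_info().hits
    html = fetch_page(url)
    # hits unchanged => the call missed the cache (a "real" fetch),
    # so apply the politeness delay; cache hits return immediately.
    if fetch_page.cache_info().hits == hits_before:
        time.sleep(delay)
    return html
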
(binary file changed, -325 Bytes; not shown)
(binary file changed, 782 Bytes; not shown)

webscraper/src/fetch_utils.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import requests
+import logging
+from functools import lru_cache
+from typing import Optional
+
+
+@lru_cache(maxsize=128)
+def cached_get(url: str, user_agent: str) -> Optional[str]:
+    print(f"[HTTP Request] Fetching from web: {url}")
+    headers = {"User-Agent": user_agent}
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except requests.RequestException as e:
+        logging.error(f"Error fetching {url}: {e}")
+        return None
+
+
+
+
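
Because cached_get now lives at module level, one cache is shared by every scraper instance in the process, and tests can reset it directly with cache_clear(). Note that both arguments form the cache key, so the same URL with a different user agent is a separate entry. A quick usage sketch (the URL and the printed stats are illustrative):

from src.fetch_utils import cached_get

html1 = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # network fetch, prints [HTTP Request]
html2 = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # served from the cache, no print
print(cached_get.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
cached_get.cache_clear()        # forget everything, e.g. between test runs
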

webscraper/src/main.py

Lines changed: 0 additions & 19 deletions
@@ -34,22 +34,3 @@ def main():
    main()
 
 
-
-
-# For testing cache
-# def main():
-#     scraper = CheaperScraper("https://books.toscrape.com")
-
-#     print("=== First Request ===")
-#     start = time.time()
-#     html1 = scraper.fetch("/")  # should print: [HTTP Request] ...
-#     print("Time taken:", round(time.time() - start, 2), "seconds\n")
-
-#     print("=== Second Request (Should Be Cached) ===")
-#     start = time.time()
-#     html2 = scraper.fetch("/")  # should NOT print: [HTTP Request] ...
-#     print("Time taken:", round(time.time() - start, 2), "seconds\n")
-#     print("Cache stats:", scraper._cached_get.cache_info())
-
-# if __name__ == "__main__":
-#     main()
(binary file changed, 164 Bytes; not shown)
(binary file changed; not shown)
webscraper/src/tests/test_fetch_and_cache.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import unittest
+import time
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
+from src.fetch_utils import cached_get
+
+from src.Cheaper_Scraper import CheaperScraper
+
+# To test, be in the webscraper directory and use the following command in the terminal:
+# python src/tests/test_fetch_and_cache.py -v
+
+
+class TestCheaperScraperFetchCache(unittest.TestCase):
+
+    def setUp(self):
+        self.scraper = CheaperScraper("https://books.toscrape.com")
+        cached_get.cache_clear()  # Reset cache before each test
+
+    def test_valid_fetch(self):
+        html = self.scraper.fetch("/")
+        self.assertIsInstance(html, str)
+        self.assertIn("<html", html.lower())
+
+    def test_invalid_path_fetch(self):
+        html = self.scraper.fetch("/this-page-does-not-exist")
+        # Even though it doesn't exist, the site may return a 200 with a 404 page
+        self.assertTrue(html is None or "<html" in html.lower())
+
+    def test_cache_effectiveness(self):
+        start = time.time()
+        self.scraper.fetch("/")  # First fetch
+        time1 = time.time() - start
+
+        start = time.time()
+        self.scraper.fetch("/")  # Second fetch (should be cached)
+        time2 = time.time() - start
+
+        cache_info = cached_get.cache_info()
+        self.assertLess(time2, time1)
+        self.assertGreaterEqual(cache_info.hits, 1)
+
+    def test_non_http_url(self):
+        with self.assertRaises(ValueError):
+            CheaperScraper("not_a_real_url")
+
+    def test_cache_timing_and_stats(self):
+        print("\n=== Cache Timing and Stats Test ===")
+
+        # First fetch (expected to be slow and hit the network)
+        start = time.time()
+        html1 = self.scraper.fetch("/")
+        time1 = round(time.time() - start, 2)
+        print(f"First fetch took: {time1} seconds")
+
+        # Second fetch (expected to be fast due to cache)
+        start = time.time()
+        html2 = self.scraper.fetch("/")
+        time2 = round(time.time() - start, 2)
+        print(f"Second fetch took: {time2} seconds")
+
+        # Confirm that the second fetch was faster
+        self.assertLess(time2, time1, "Second fetch should be faster due to caching")
+
+        # Print and assert cache stats
+        stats = cached_get.cache_info()
+        print("Cache stats:", stats)
+        self.assertGreaterEqual(stats.hits, 1, "There should be at least 1 cache hit")
+
+
+
+
+if __name__ == "__main__":
+    unittest.main()
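
These tests exercise the live site, so they depend on the network. A possible follow-up, not part of this commit: patch requests.get with unittest.mock so cached_get can be tested offline. The mock response object and test name below are illustrative, and the same sys.path setup as in the test file above is assumed.

import unittest
from unittest import mock

from src.fetch_utils import cached_get

class TestCachedGetOffline(unittest.TestCase):
    def setUp(self):
        cached_get.cache_clear()  # keep the shared cache from leaking between tests

    def test_returns_body_without_network(self):
        fake = mock.Mock(text="<html>stub</html>")
        fake.raise_for_status.return_value = None  # simulate a 200 response
        with mock.patch("src.fetch_utils.requests.get", return_value=fake) as get:
            first = cached_get("https://example.com/", "CheaperBot/0.1")
            second = cached_get("https://example.com/", "CheaperBot/0.1")
        self.assertEqual(first, "<html>stub</html>")
        self.assertEqual(second, "<html>stub</html>")
        get.assert_called_once()  # the second call was served from the cache

if __name__ == "__main__":
    unittest.main()
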
