Commit 84264a7

Addressed comments on the initial commit: created a test file in the ./src/tests directory and used the unittest suite to explore the conditions under which the function raises an exception and how it handles valid and invalid data. Moved cached_get into a separate file.
1 parent a3b373e commit 84264a7

8 files changed

Lines changed: 107 additions & 40 deletions


webscraper/src/Cheaper_Scraper.py

Lines changed: 12 additions & 21 deletions
@@ -1,15 +1,16 @@
 import requests
 import time
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 import logging
 from typing import Dict, List, Optional
 # I added these imports below because when I ran it, it wasn't finding the folders; remove them if you don't need them
 import sys
 import os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-
+from src.fetch_utils import cached_get
 from ABC.base_scraper import BaseScraper
-from robot_check import RoboCheck
+from src.robot_check import RoboCheck
 from functools import lru_cache
 
 
@@ -22,7 +23,10 @@ def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float
            user_agent: User agent string to identify the scraper
            delay: Time in seconds to wait between requests
        """
-
+        parsed_url = urlparse(base_url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            raise ValueError(f"Invalid base URL: {base_url}")
+
        self.base_url = base_url.rstrip('/')
        self.delay = delay
        self.user_agent = user_agent
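
The new guard relies on urllib.parse.urlparse, which typically does not raise on a malformed string; it just returns empty components, so checking scheme and netloc catches inputs like "not_a_real_url". A quick illustration (the variable names are illustrative, not part of this commit):

from urllib.parse import urlparse

ok = urlparse("https://books.toscrape.com")  # scheme='https', netloc='books.toscrape.com' -> passes the guard
bad = urlparse("not_a_real_url")             # scheme='', netloc='' -> __init__ raises ValueError
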
@@ -37,19 +41,7 @@ def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float
        self.robots = RoboCheck(base_url, user_agent)
 
 
-    @staticmethod
-    @lru_cache(maxsize=128)  # cache up to 128 unique URLs
-    def _cached_get(url: str, user_agent: str) -> Optional[str]:
-        print(f"[HTTP Request] Fetching from web: {url}")  # <== ADD THIS
-        headers = {"User-Agent": user_agent}
-        try:
-            response = requests.get(url, headers=headers, timeout=10)
-            response.raise_for_status()
-            return response.text
-        except requests.RequestException as e:
-            logging.error(f"Error fetching {url}: {e}")
-            return None
-
+
 
    def fetch(self, path: str = "/") -> Optional[str]:
        """Fetch content from a specific path.
@@ -65,13 +57,12 @@ def fetch(self, path: str = "/") -> Optional[str]:
            logging.warning(f"Disallowed by robots.txt: {path}")
            return None
 
-
        url = self.base_url + path
-        cached_before = self._cached_get.cache_info().hits
-        html = self._cached_get(url, self.user_agent)
-        cached_after = self._cached_get.cache_info().hits
+        cached_before = cached_get.cache_info().hits
+        html = cached_get(url, self.user_agent)
+        cached_after = cached_get.cache_info().hits
 
-        if cached_after == cached_before:  # No cache hit, so it was fetched
+        if cached_after == cached_before:
            time.sleep(self.delay)
 
        return html
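
The rate-limit logic in fetch() leans on a documented property of functools.lru_cache: the wrapper's cache_info().hits counter only increments when a call is answered from the cache. Comparing the counter before and after a call therefore reveals whether a real network request happened, so the politeness delay is skipped on cache hits. A minimal standalone sketch of the pattern (fetch_page and its fake body are illustrative, not part of this commit):

from functools import lru_cache
import time

@lru_cache(maxsize=128)
def fetch_page(url: str) -> str:
    # Stand-in for a real HTTP request.
    return f"<html>{url}</html>"

def polite_fetch(url: str, delay: float = 1.0) -> str:
    hits_before = fetch_page.cache_info().hits
    html = fetch_page(url)
    # hits unchanged => the call missed the cache (a "real" fetch),
    # so apply the politeness delay; cache hits return immediately.
    if fetch_page.cache_info().hits == hits_before:
        time.sleep(delay)
    return html
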
(binary file changed, -325 Bytes; not shown)
(binary file changed, 782 Bytes; not shown)

webscraper/src/fetch_utils.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import requests
+import logging
+from functools import lru_cache
+from typing import Optional
+
+
+@lru_cache(maxsize=128)
+def cached_get(url: str, user_agent: str) -> Optional[str]:
+    print(f"[HTTP Request] Fetching from web: {url}")
+    headers = {"User-Agent": user_agent}
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except requests.RequestException as e:
+        logging.error(f"Error fetching {url}: {e}")
+        return None
+
+
+
+
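
Because cached_get now lives at module level, one cache is shared by every scraper instance in the process, and tests can reset it directly with cache_clear(). Note that both arguments form the cache key, so the same URL with a different user agent is a separate entry. A quick usage sketch (the URL and the printed stats are illustrative):

from src.fetch_utils import cached_get

html1 = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # network fetch, prints [HTTP Request]
html2 = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # served from the cache, no print
print(cached_get.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
cached_get.cache_clear()        # forget everything, e.g. between test runs
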

webscraper/src/main.py

Lines changed: 0 additions & 19 deletions
@@ -34,22 +34,3 @@ def main():
    main()
 
 
-
-
-# For testing cache
-# def main():
-#     scraper = CheaperScraper("https://books.toscrape.com")
-
-#     print("=== First Request ===")
-#     start = time.time()
-#     html1 = scraper.fetch("/")  # should print: [HTTP Request] ...
-#     print("Time taken:", round(time.time() - start, 2), "seconds\n")
-
-#     print("=== Second Request (Should Be Cached) ===")
-#     start = time.time()
-#     html2 = scraper.fetch("/")  # should NOT print: [HTTP Request] ...
-#     print("Time taken:", round(time.time() - start, 2), "seconds\n")
-#     print("Cache stats:", scraper._cached_get.cache_info())
-
-# if __name__ == "__main__":
-#     main()
(binary file changed, 164 Bytes; not shown)
(binary file changed; not shown)
webscraper/src/tests/test_fetch_and_cache.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import unittest
+import time
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
+from src.fetch_utils import cached_get
+
+from src.Cheaper_Scraper import CheaperScraper
+
+# To test, be in the webscraper directory and use the following command in the terminal:
+# python src/tests/test_fetch_and_cache.py -v
+
+
+class TestCheaperScraperFetchCache(unittest.TestCase):
+
+    def setUp(self):
+        self.scraper = CheaperScraper("https://books.toscrape.com")
+        cached_get.cache_clear()  # Reset cache before each test
+
+    def test_valid_fetch(self):
+        html = self.scraper.fetch("/")
+        self.assertIsInstance(html, str)
+        self.assertIn("<html", html.lower())
+
+    def test_invalid_path_fetch(self):
+        html = self.scraper.fetch("/this-page-does-not-exist")
+        # Even though it doesn't exist, the site may return a 200 with a 404 page
+        self.assertTrue(html is None or "<html" in html.lower())
+
+    def test_cache_effectiveness(self):
+        start = time.time()
+        self.scraper.fetch("/")  # First fetch
+        time1 = time.time() - start
+
+        start = time.time()
+        self.scraper.fetch("/")  # Second fetch (should be cached)
+        time2 = time.time() - start
+
+        cache_info = cached_get.cache_info()
+        self.assertLess(time2, time1)
+        self.assertGreaterEqual(cache_info.hits, 1)
+
+    def test_non_http_url(self):
+        with self.assertRaises(ValueError):
+            CheaperScraper("not_a_real_url")
+
+    def test_cache_timing_and_stats(self):
+        print("\n=== Cache Timing and Stats Test ===")
+
+        # First fetch (expected to be slow and hit the network)
+        start = time.time()
+        html1 = self.scraper.fetch("/")
+        time1 = round(time.time() - start, 2)
+        print(f"First fetch took: {time1} seconds")
+
+        # Second fetch (expected to be fast due to cache)
+        start = time.time()
+        html2 = self.scraper.fetch("/")
+        time2 = round(time.time() - start, 2)
+        print(f"Second fetch took: {time2} seconds")
+
+        # Confirm that the second fetch was faster
+        self.assertLess(time2, time1, "Second fetch should be faster due to caching")
+
+        # Print and assert cache stats
+        stats = cached_get.cache_info()
+        print("Cache stats:", stats)
+        self.assertGreaterEqual(stats.hits, 1, "There should be at least 1 cache hit")
+
+
+
+
+if __name__ == "__main__":
+    unittest.main()
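
These tests exercise the live site, so they depend on the network. A possible follow-up, not part of this commit: patch requests.get with unittest.mock so cached_get can be tested offline. The mock response object and test name below are illustrative, and the same sys.path setup as in the test file above is assumed.

import unittest
from unittest import mock

from src.fetch_utils import cached_get

class TestCachedGetOffline(unittest.TestCase):
    def setUp(self):
        cached_get.cache_clear()  # keep the shared cache from leaking between tests

    def test_returns_body_without_network(self):
        fake = mock.Mock(text="<html>stub</html>")
        fake.raise_for_status.return_value = None  # simulate a 200 response
        with mock.patch("src.fetch_utils.requests.get", return_value=fake) as get:
            first = cached_get("https://example.com/", "CheaperBot/0.1")
            second = cached_get("https://example.com/", "CheaperBot/0.1")
        self.assertEqual(first, "<html>stub</html>")
        self.assertEqual(second, "<html>stub</html>")
        get.assert_called_once()  # the second call was served from the cache

if __name__ == "__main__":
    unittest.main()
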
