From 6bf5b2b9d943f424e3f53d8e25b55c6ac2cc9c31 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 28 Nov 2025 22:10:25 +0000 Subject: [PATCH 1/2] Initial plan From cf5d5bcc4fbd53260f36007c73dedc5c9d22f795 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 28 Nov 2025 22:13:12 +0000 Subject: [PATCH 2/2] Rename page parameter to page_url in extract_page_content for clarity Co-authored-by: xujiongze <31494901+xujiongze@users.noreply.github.com> --- python/crawler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/crawler.py b/python/crawler.py index f423518..833883f 100644 --- a/python/crawler.py +++ b/python/crawler.py @@ -158,13 +158,13 @@ async def get_noticias_urls(self, num=50): return noticias_urls - async def extract_page_content(self, page): + async def extract_page_content(self, page_url): self.logger.send_log( - message=f"Starting content extraction for page: {page}", + message=f"Starting content extraction for page: {page_url}", labels={"job": "web_crawler", "event": "page_content_extraction"} ) - await self.page.goto(page) + await self.page.goto(page_url) title = await self.page.query_selector('h1') content = await self.page.query_selector('#main-box') @@ -174,11 +174,11 @@ async def extract_page_content(self, page): page_data = { "title": title_text, "content": content_text, - "type": "noticia" if "latest-news" in page else "general", + "type": "noticia" if "latest-news" in page_url else "general", } os.makedirs("data", exist_ok=True) - filename = os.path.join("data", f"{page.replace('/', '_').replace(':', '')}.json") + filename = os.path.join("data", f"{page_url.replace('/', '_').replace(':', '')}.json") try: with open(filename, "w", encoding="utf-8") as f: