diff --git a/app/services/implementations/serper_web_search_service.py b/app/services/implementations/serper_web_search_service.py
index 4fe0f3b..7b83baf 100644
--- a/app/services/implementations/serper_web_search_service.py
+++ b/app/services/implementations/serper_web_search_service.py
@@ -1,19 +1,35 @@
-from typing import List, Optional
-import aiohttp
-from datetime import UTC, datetime
 import logging
+from datetime import UTC, datetime
+from typing import List, Optional
 from uuid import UUID, uuid4
-from app.core.config import settings
+
+import aiohttp
 from sqlalchemy.exc import IntegrityError
 
+from app.core.config import settings
 from app.core.exceptions import ValidationError
+from app.core.utils.url import normalize_domain_name
 from app.models.database.models import SourceModel
-from app.services.interfaces.web_search_service import WebSearchServiceInterface
 from app.repositories.implementations.source_repository import SourceRepository
 from app.services.domain_service import DomainService
-from app.core.utils.url import normalize_domain_name
+from app.services.interfaces.web_search_service import WebSearchServiceInterface
 
 logger = logging.getLogger(__name__)
 
+# Hard filter: sources on these domains (or any of their subdomains) are always excluded.
+BLOCKED_SOURCE_DOMAINS = {
+    "reddit.com",
+    "instagram.com",
+    "youtube.com",
+    "youtu.be",
+    "bsky.app",
+    "bsky.social",
+    "tiktok.com",
+    "facebook.com",
+    "x.com",
+    "twitter.com",
+    "threads.net",
+}
+
 
 class SerperWebSearchService(WebSearchServiceInterface):
@@ -23,11 +39,38 @@ def __init__(self, domain_service: DomainService, source_repository: SourceRepos
         self.domain_service = domain_service
         self.source_repository = source_repository
 
+    def _is_blocked_domain(self, domain_name: str) -> bool:
+        """Return True if the domain is a blocked domain or a subdomain of one."""
+        normalized_domain = normalize_domain_name(domain_name)
+        return any(
+            normalized_domain == blocked_domain or normalized_domain.endswith(f".{blocked_domain}")
+            for blocked_domain in BLOCKED_SOURCE_DOMAINS
+        )
+
+    def _has_acceptable_credibility(self, credibility_score: Optional[float]) -> bool:
+        """Return True when the score is missing (None) or strictly positive."""
+        # Block explicit 0 credibility, but allow unscored domains for now.
+        # This avoids dropping credible sources just because they are missing from our domain DB.
+        return credibility_score is None or credibility_score > 0
+
+    def _is_allowed_source(self, source: SourceModel) -> bool:
+        """Return True if the source's domain is not blocked and its credibility is acceptable."""
+        domain_name = (
+            source.domain.domain_name
+            if hasattr(source, "domain") and source.domain
+            else normalize_domain_name(source.url)
+        )
+        return not self._is_blocked_domain(domain_name) and self._has_acceptable_credibility(source.credibility_score)
+
+    def _filter_allowed_sources(self, sources: List[SourceModel]) -> List[SourceModel]:
+        """Return only the sources for which `_is_allowed_source` is True."""
+        return [source for source in sources if self._is_allowed_source(source)]
+
     async def search_and_create_sources(
         self, claim_text: str, search_id: UUID, num_results: int = 5, language: str = "english"
     ) -> List[SourceModel]:
         """Search for sources and create or update records."""
         try:
             payload = {"q": claim_text, "location": "Canada", "gl": "ca"}
             if language == "french":
                 payload["hl"] = "fr"
@@ -47,13 +90,25 @@ async def search_and_create_sources(
                     logger.warning("No search results found")
                     return []
 
-                for item in data["organic"][:5]:
+                for item in data["organic"][:10]:
                     try:
                         domain_name = normalize_domain_name(item["link"])
 
+                        # Blocked platforms are always excluded regardless of credibility score.
+                        if self._is_blocked_domain(domain_name):
+                            logger.info(f"Skipping blocked source domain: {domain_name}")
+                            continue
+
                         domain, is_new = await self.domain_service.get_or_create_domain(domain_name)
                         if is_new:
                             logger.info(f"Created new domain record for: {domain_name}")
 
+                        if not self._has_acceptable_credibility(domain.credibility_score):
+                            logger.info(
+                                f"Skipping source with zero credibility: "
+                                f"{domain_name} ({domain.credibility_score})"
+                            )
+                            continue
+                        logger.info(f"Allowing source: {domain_name} (credibility={domain.credibility_score})")
                         source = await self._create_new_source(item, search_id, domain.id, domain.credibility_score)
                         if source:
@@ -108,6 +163,7 @@ async def _create_new_source(
 
     def format_sources_for_prompt(self, sources: List[SourceModel], language: str = "english") -> str:
         """Format sources into a string for the LLM prompt."""
+        sources = self._filter_allowed_sources(sources)
         if language == "english":
             if not sources:
                 return "No reliable sources found."
@@ -158,6 +214,8 @@ def format_sources_for_prompt(self, sources: List[SourceModel], language: str =
 
     def calculate_overall_credibility(self, sources: List[SourceModel]) -> float:
         """Calculate overall credibility score for a set of sources."""
+        sources = self._filter_allowed_sources(sources)
+
         if not sources:
             return 0.0
 
diff --git a/app/services/implementations/web_search_service.py b/app/services/implementations/web_search_service.py
index ffe1a79..f4bdd9f 100644
--- a/app/services/implementations/web_search_service.py
+++ b/app/services/implementations/web_search_service.py
@@ -49,7 +49,8 @@ def _is_blocked_domain(self, domain_name: str) -> bool:
 
     def _has_acceptable_credibility(self, credibility_score: Optional[float]) -> bool:
-        # This removes both unknown credibility and explicit 0 credibility.
-        return credibility_score is not None and credibility_score > 0
+        # Block explicit 0 credibility, but allow unscored domains for now.
+        # This avoids dropping credible sources just because they are missing from our domain DB.
+        return credibility_score is None or credibility_score > 0
 
     def _is_allowed_source(self, source: SourceModel) -> bool:
         domain_name = (
@@ -132,11 +133,11 @@ async def search_and_create_sources(
                             logger.info(f"Created new domain record for: {domain_name}")
                         if not self._has_acceptable_credibility(domain.credibility_score):
                             logger.info(
-                                f"Skipping source with low/unknown credibility: "
+                                f"Skipping source with zero credibility: "
                                 f"{domain_name} ({domain.credibility_score})"
                             )
                             continue
-
+                        logger.info(f"Allowing source: {domain_name} (credibility={domain.credibility_score})")
                         source = await self._create_new_source(
                             item,
                             search_id,