ComplexData-MILA · dorsafsallami · Jun 9, 2026 · Jun 9, 2026
diff --git a/app/services/implementations/serper_web_search_service.py b/app/services/implementations/serper_web_search_service.py
@@ -1,19 +1,34 @@
-from typing import List, Optional
-import aiohttp
-from datetime import UTC, datetime
 import logging
+from datetime import UTC, datetime
+from typing import List, Optional
 from uuid import UUID, uuid4
-from app.core.config import settings
+
+import aiohttp
 from sqlalchemy.exc import IntegrityError
 
+from app.core.config import settings
 from app.core.exceptions import ValidationError
+from app.core.utils.url import normalize_domain_name
 from app.models.database.models import SourceModel
-from app.services.interfaces.web_search_service import WebSearchServiceInterface
 from app.repositories.implementations.source_repository import SourceRepository
 from app.services.domain_service import DomainService
-from app.core.utils.url import normalize_domain_name
+from app.services.interfaces.web_search_service import WebSearchServiceInterface
 
 logger = logging.getLogger(__name__)
+# Hard filter: domains (and their subdomains) excluded as sources regardless of credibility score.
+BLOCKED_SOURCE_DOMAINS = {
+    "reddit.com",
+    "instagram.com",
+    "youtube.com",
+    "youtu.be",
+    "bsky.app",
+    "bsky.social",
+    "tiktok.com",
+    "facebook.com",
+    "x.com",
+    "twitter.com",
+    "threads.net",
+}
 
 
 class SerperWebSearchService(WebSearchServiceInterface):
@@ -23,11 +38,35 @@ def __init__(self, domain_service: DomainService, source_repository: SourceRepos
         self.domain_service = domain_service
         self.source_repository = source_repository
 
+    def _is_blocked_domain(self, domain_name: str) -> bool:
+        """Return True when the normalized domain is a blocked domain or a subdomain of one."""
+        candidate = normalize_domain_name(domain_name)
+        if candidate in BLOCKED_SOURCE_DOMAINS:
+            return True
+        return candidate.endswith(tuple(f".{blocked}" for blocked in BLOCKED_SOURCE_DOMAINS))
+
+    def _has_acceptable_credibility(self, credibility_score: Optional[float]) -> bool:
+        """Return True for unscored (None) or positive scores; False for zero or negative ones."""
+        # Unscored domains pass for now so credible sources missing from our domain DB are kept.
+        return True if credibility_score is None else credibility_score > 0
+
+    def _is_allowed_source(self, source: SourceModel) -> bool:
+        """Allow a source unless its domain is blocked or its credibility score is rejected."""
+        linked = getattr(source, "domain", None)
+        # No linked domain record: fall back to normalize_domain_name on the source URL.
+        name = linked.domain_name if linked else normalize_domain_name(source.url)
+        blocked = self._is_blocked_domain(name)
+        return not blocked and self._has_acceptable_credibility(source.credibility_score)
+
+    def _filter_allowed_sources(self, sources: List[SourceModel]) -> List[SourceModel]:
+        return list(filter(self._is_allowed_source, sources))  # keep order; drop disallowed sources
+
     async def search_and_create_sources(
         self, claim_text: str, search_id: UUID, num_results: int = 5, language: str = "english"
     ) -> List[SourceModel]:
         """Search for sources and create or update records."""
         try:
+            logger.warning("SERPER SOURCE FILTER CODE IS RUNNING")
             payload = {"q": claim_text, "location": "Canada", "gl": "ca"}
             if language == "french":
                 payload["hl"] = "fr"
@@ -47,13 +86,25 @@ async def search_and_create_sources(
                         logger.warning("No search results found")
                         return []
 
-                    for item in data["organic"][:5]:
+                    for item in data["organic"][:10]:
                         try:
                             domain_name = normalize_domain_name(item["link"])
+                            # Blocked platforms are always excluded regardless of credibility score.
+                            if self._is_blocked_domain(domain_name):
+                                logger.info(f"Skipping blocked source domain: {domain_name}")
+                                continue
+
                             domain, is_new = await self.domain_service.get_or_create_domain(domain_name)
 
                             if is_new:
                                 logger.info(f"Created new domain record for: {domain_name}")
+                            if not self._has_acceptable_credibility(domain.credibility_score):
+                                logger.info(
+                                    f"Skipping source with zero credibility: "
+                                    f"{domain_name} ({domain.credibility_score})"
+                                )
+                                continue
+                            logger.info(f"Allowing source: {domain_name} " f"(credibility={domain.credibility_score})")
 
                             source = await self._create_new_source(item, search_id, domain.id, domain.credibility_score)
                             if source:
@@ -108,6 +159,7 @@ async def _create_new_source(
 
     def format_sources_for_prompt(self, sources: List[SourceModel], language: str = "english") -> str:
         """Format sources into a string for the LLM prompt."""
+        sources = self._filter_allowed_sources(sources)
         if language == "english":
             if not sources:
                 return "No reliable sources found."
@@ -158,6 +210,8 @@ def format_sources_for_prompt(self, sources: List[SourceModel], language: str =
 
     def calculate_overall_credibility(self, sources: List[SourceModel]) -> float:
         """Calculate overall credibility score for a set of sources."""
+        sources = self._filter_allowed_sources(sources)
+
         if not sources:
             return 0.0
 

diff --git a/app/services/implementations/web_search_service.py b/app/services/implementations/web_search_service.py
@@ -49,7 +49,10 @@ def _is_blocked_domain(self, domain_name: str) -> bool:
 
     def _has_acceptable_credibility(self, credibility_score: Optional[float]) -> bool:
-        # This removes both unknown credibility and explicit 0 credibility.
-        return credibility_score is not None and credibility_score > 0
+        """Return True when a source with this domain credibility score may be used."""
+        # Block explicit 0 credibility, but allow unscored domains for now.
+        # This avoids dropping credible sources just because they are missing from our domain DB.
+        # Negative scores fail the `> 0` check and are rejected as well.
+        return credibility_score is None or credibility_score > 0
 
     def _is_allowed_source(self, source: SourceModel) -> bool:
         domain_name = (
@@ -132,11 +135,11 @@ async def search_and_create_sources(
                                 logger.info(f"Created new domain record for: {domain_name}")
                             if not self._has_acceptable_credibility(domain.credibility_score):
                                 logger.info(
-                                    f"Skipping source with low/unknown credibility: "
+                                    f"Skipping source with zero credibility: "
                                     f"{domain_name} ({domain.credibility_score})"
                                 )
                                 continue
-
+                            logger.info(f"Allowing source: {domain_name} " f"(credibility={domain.credibility_score})")
                             source = await self._create_new_source(
                                 item,
                                 search_id,