Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 61 additions & 7 deletions app/services/implementations/serper_web_search_service.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,34 @@
from typing import List, Optional
import aiohttp
from datetime import UTC, datetime
import logging
from datetime import UTC, datetime
from typing import List, Optional
from uuid import UUID, uuid4
from app.core.config import settings

import aiohttp
from sqlalchemy.exc import IntegrityError

from app.core.config import settings
from app.core.exceptions import ValidationError
from app.core.utils.url import normalize_domain_name
from app.models.database.models import SourceModel
from app.services.interfaces.web_search_service import WebSearchServiceInterface
from app.repositories.implementations.source_repository import SourceRepository
from app.services.domain_service import DomainService
from app.core.utils.url import normalize_domain_name
from app.services.interfaces.web_search_service import WebSearchServiceInterface

logger = logging.getLogger(__name__)

# Hard filter: a search result whose domain equals one of these entries, or is
# a subdomain of one, is excluded regardless of its credibility score.
BLOCKED_SOURCE_DOMAINS = {
    "bsky.app",
    "bsky.social",
    "facebook.com",
    "instagram.com",
    "reddit.com",
    "threads.net",
    "tiktok.com",
    "twitter.com",
    "x.com",
    "youtu.be",
    "youtube.com",
}


class SerperWebSearchService(WebSearchServiceInterface):
Expand All @@ -23,11 +38,35 @@ def __init__(self, domain_service: DomainService, source_repository: SourceRepos
self.domain_service = domain_service
self.source_repository = source_repository

def _is_blocked_domain(self, domain_name: str) -> bool:
    """Return True when ``domain_name`` is a blocked domain or a subdomain of one.

    The name is passed through ``normalize_domain_name`` first, then compared
    against every entry of ``BLOCKED_SOURCE_DOMAINS``.
    """
    candidate = normalize_domain_name(domain_name)
    for blocked in BLOCKED_SOURCE_DOMAINS:
        # Exact match, e.g. "x.com".
        if candidate == blocked:
            return True
        # Subdomain match; the leading dot keeps "notx.com" from matching "x.com".
        if candidate.endswith(f".{blocked}"):
            return True
    return False

def _has_acceptable_credibility(self, credibility_score: Optional[float]) -> bool:
# Block explicit 0 credibility, but allow unscored domains for now.
# This avoids dropping credible sources just because they are missing from our domain DB.
return credibility_score is None or credibility_score > 0

def _is_allowed_source(self, source: SourceModel) -> bool:
    """Return True when ``source`` passes both the domain and credibility filters.

    The domain name comes from ``source.domain.domain_name`` when the source
    has a truthy ``domain`` attribute; otherwise it is derived from
    ``source.url`` via ``normalize_domain_name``.
    """
    if hasattr(source, "domain") and source.domain:
        name = source.domain.domain_name
    else:
        name = normalize_domain_name(source.url)

    # A blocked domain short-circuits: the credibility score is not consulted.
    if self._is_blocked_domain(name):
        return False
    return self._has_acceptable_credibility(source.credibility_score)

def _filter_allowed_sources(self, sources: List[SourceModel]) -> List[SourceModel]:
    """Return a new list holding only the sources accepted by ``_is_allowed_source``.

    Input order is preserved and the input list is not modified.
    """
    return list(filter(self._is_allowed_source, sources))

async def search_and_create_sources(
self, claim_text: str, search_id: UUID, num_results: int = 5, language: str = "english"
) -> List[SourceModel]:
"""Search for sources and create or update records."""
try:
logger.warning("SERPER SOURCE FILTER CODE IS RUNNING")
payload = {"q": claim_text, "location": "Canada", "gl": "ca"}
if language == "french":
payload["hl"] = "fr"
Expand All @@ -47,13 +86,25 @@ async def search_and_create_sources(
logger.warning("No search results found")
return []

for item in data["organic"][:5]:
for item in data["organic"][:10]:
try:
domain_name = normalize_domain_name(item["link"])
# Blocked platforms are always excluded regardless of credibility score.
if self._is_blocked_domain(domain_name):
logger.info(f"Skipping blocked source domain: {domain_name}")
continue

domain, is_new = await self.domain_service.get_or_create_domain(domain_name)

if is_new:
logger.info(f"Created new domain record for: {domain_name}")
if not self._has_acceptable_credibility(domain.credibility_score):
logger.info(
f"Skipping source with zero credibility: "
f"{domain_name} ({domain.credibility_score})"
)
continue
logger.info(f"Allowing source: {domain_name} " f"(credibility={domain.credibility_score})")

source = await self._create_new_source(item, search_id, domain.id, domain.credibility_score)
if source:
Expand Down Expand Up @@ -108,6 +159,7 @@ async def _create_new_source(

def format_sources_for_prompt(self, sources: List[SourceModel], language: str = "english") -> str:
"""Format sources into a string for the LLM prompt."""
sources = self._filter_allowed_sources(sources)
if language == "english":
if not sources:
return "No reliable sources found."
Expand Down Expand Up @@ -158,6 +210,8 @@ def format_sources_for_prompt(self, sources: List[SourceModel], language: str =

def calculate_overall_credibility(self, sources: List[SourceModel]) -> float:
"""Calculate overall credibility score for a set of sources."""
sources = self._filter_allowed_sources(sources)

if not sources:
return 0.0

Expand Down
9 changes: 6 additions & 3 deletions app/services/implementations/web_search_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def _is_blocked_domain(self, domain_name: str) -> bool:

def _has_acceptable_credibility(self, credibility_score: Optional[float]) -> bool:
# This removes both unknown credibility and explicit 0 credibility.
return credibility_score is not None and credibility_score > 0
# return credibility_score is not None and credibility_score > 0
# Block explicit 0 credibility, but allow unscored domains for now.
# This avoids dropping credible sources just because they are missing from our domain DB.
return credibility_score is None or credibility_score > 0

def _is_allowed_source(self, source: SourceModel) -> bool:
domain_name = (
Expand Down Expand Up @@ -132,11 +135,11 @@ async def search_and_create_sources(
logger.info(f"Created new domain record for: {domain_name}")
if not self._has_acceptable_credibility(domain.credibility_score):
logger.info(
f"Skipping source with low/unknown credibility: "
f"Skipping source with zero credibility: "
f"{domain_name} ({domain.credibility_score})"
)
continue

logger.info(f"Allowing source: {domain_name} " f"(credibility={domain.credibility_score})")
source = await self._create_new_source(
item,
search_id,
Expand Down
Loading