Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,14 @@ __pycache__/
.idea/
.vscode/
*.log

hf_cache
.venv
.git
__pycache__
**/__pycache__
**/.pytest_cache
**/.mypy_cache
infrastructure/terraform/.terraform
*.tfstate
*.tfstate.*
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ service-account.json
get-pip.py
.DS_Store
hf_cache/
infrastructure/deploy.ps1
16 changes: 16 additions & 0 deletions app/api/endpoints/media_endpoints.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from fastapi import APIRouter, Depends, File, UploadFile

from app.api.dependencies import get_current_user
from app.models.domain.user import User
from app.services.openfake_service import verify_media_with_openfake

router = APIRouter(prefix="/media")


@router.post("/verify")
async def verify_media(
file: UploadFile = File(...),
# Only logged-in users can call this endpoint
current_user: User = Depends(get_current_user),
):
return await verify_media_with_openfake(file)
19 changes: 11 additions & 8 deletions app/api/router.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from fastapi import APIRouter

from app.api.endpoints import (
user_endpoints,
claim_endpoints,
analysis_endpoints,
source_endpoints,
search_endpoints,
feedback_endpoints,
claim_conversation_endpoints,
claim_endpoints,
conversation_endpoints,
message_endpoints,
discussion_endpoints,
domain_endpoints,
feedback_endpoints,
health_endpoints,
claim_conversation_endpoints,
discussion_endpoints,
media_endpoints,
message_endpoints,
post_endpoints,
search_endpoints,
source_endpoints,
user_endpoints,
)

router = APIRouter()
Expand All @@ -30,3 +32,4 @@
router.include_router(post_endpoints.router, tags=["posts"])
router.include_router(claim_conversation_endpoints.router, tags=["claim-conversations"])
router.include_router(health_endpoints.router, tags=["health"])
router.include_router(media_endpoints.router, tags=["media"])
9 changes: 7 additions & 2 deletions app/core/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import logging
import os
from functools import lru_cache
from typing import Optional

from pydantic_settings import BaseSettings # type: ignore
from functools import lru_cache
import logging

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,6 +41,10 @@ class Settings(BaseSettings):

DEBUG: bool = False

# For OpenFake
OPENFAKE_API_URL: str = "https://complexdatalab-openfakedemo.hf.space/api/predict"
MEDIA_VERIFICATION_ENABLED: bool = True

def __init__(self, **kwargs):
super().__init__(**kwargs)
if self.DEBUG:
Expand Down
7 changes: 6 additions & 1 deletion app/core/llm/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class AnalysisPrompt:
"2. Ensure all special characters in the analysis text are properly escaped\n"
"3. The analysis field should be a single line with newlines represented as \\n\n"
"4. Do not include any control characters\n"
Before responding, check that your answer is valid JSON and matches this exact structure:
{"veracity_score":NUMBER,"analysis":"TEXT"}

"""

Expand All @@ -62,6 +64,8 @@ class AnalysisPrompt:
"2. Ensure all special characters in the analysis text are properly escaped\n"
"3. The analysis field should be a single line with newlines represented as \\n\n"
"4. Do not include any control characters\n"
Before responding, check that your answer is valid JSON and matches this exact structure:
{"veracity_score":NUMBER,"analysis":"TEXT"}

"""

Expand All @@ -82,7 +86,8 @@ class AnalysisPrompt:
"2. Assurez-vous que tous les caractères spéciaux dans le texte d'analyse sont correctement retranscrits\n"
"3. Le champ "analysis" doit être une seule ligne avec des nouvelles lignes représentées par \\n\n"
"4. N'ajoutez aucune entité de caractère\n"

Avant de répondre, vérifie que ta réponse est un JSON valide et qu’elle respecte exactement cette structure :
{"veracity_score":NUMBER,"analysis":"TEXT"}
"""

GET_CONFIDENCE = """
Expand Down
88 changes: 88 additions & 0 deletions app/core/text_safety.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import json
import logging
import re
from typing import Any

logger = logging.getLogger(__name__)


def clean_unicode_text(value: Any) -> str:
"""
Make text safe for JSON responses and PostgreSQL text fields.
"""
if value is None:
return ""

text = value if isinstance(value, str) else str(value)

text = text.replace("\x00", "")
text = text.replace("\x1a", "")

return text.encode("utf-8", errors="replace").decode("utf-8", errors="replace")


def extract_json_candidate(raw_text: str) -> str:
text = clean_unicode_text(raw_text).strip()

text = re.sub(r"^```(?:json)?\s*", "", text)
text = re.sub(r"\s*```$", "", text)

start = text.find("{")
end = text.rfind("}")

if start != -1 and end > start:
return text[start : end + 1]

return text


def parse_analysis_response(raw_text: str) -> tuple[int, str]:
candidate = extract_json_candidate(raw_text)

try:
data = json.loads(candidate)
score = int(float(data.get("veracity_score", 0)))
analysis = clean_unicode_text(data.get("analysis", "No analysis provided"))
return max(0, min(100, score)), analysis

except (json.JSONDecodeError, TypeError, ValueError) as e:
logger.warning(
"Malformed analysis JSON from LLM. Falling back to regex extraction. " "Error=%s Raw=%r",
e,
raw_text[:2000],
)

score_match = re.search(
r'"veracity_score"\s*:\s*([0-9]+(?:\.\d+)?)',
candidate,
)
score = int(float(score_match.group(1))) if score_match else 0
score = max(0, min(100, score))

analysis_match = re.search(
r'"analysis"\s*:\s*(.*)$',
candidate,
flags=re.DOTALL,
)

if analysis_match:
analysis = analysis_match.group(1).strip()
analysis = analysis.rstrip("}").rstrip(",").strip()

if analysis.startswith('"'):
analysis = analysis[1:]

if analysis.endswith('"'):
analysis = analysis[:-1]

analysis = analysis.replace('\\"', '"')
analysis = analysis.replace("\\n", "\n")
else:
analysis = candidate

analysis = clean_unicode_text(analysis)

if not analysis:
analysis = "Analysis was generated, but the response format was malformed."

return score, analysis
113 changes: 69 additions & 44 deletions app/services/analysis_orchestrator.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
import logging
from typing import AsyncGenerator, Dict, Any, List, Optional, NamedTuple
from uuid import UUID, uuid4
from datetime import UTC, datetime
import json
import logging
import math
import re
from copy import deepcopy
import math
from datetime import UTC, datetime
from typing import Any, AsyncGenerator, Dict, List, NamedTuple, Optional
from uuid import UUID, uuid4

from app.core.exceptions import NotAuthorizedException, NotFoundException, ValidationError
from app.core.llm.interfaces import LLMProvider
from app.core.llm.messages import Message as LLMMessage
from app.core.llm.prompts import AnalysisPrompt
from app.core.text_safety import clean_unicode_text, parse_analysis_response
from app.models.database.models import AnalysisStatus, ClaimStatus, ConversationStatus, MessageSenderType
from app.models.domain.claim import Claim
from app.models.domain.analysis import Analysis, LogProbsData
from app.models.domain.search import Search
from app.models.domain.message import Message
from app.core.llm.messages import Message as LLMMessage
from app.models.domain.conversation import Conversation
from app.models.domain.claim import Claim
from app.models.domain.claim_conversation import ClaimConversation
from app.models.domain.conversation import Conversation
from app.models.domain.message import Message
from app.models.domain.search import Search
from app.repositories.implementations.analysis_repository import AnalysisRepository
from app.repositories.implementations.claim_conversation_repository import ClaimConversationRepository
from app.repositories.implementations.claim_repository import ClaimRepository
from app.repositories.implementations.analysis_repository import AnalysisRepository
from app.repositories.implementations.message_repository import MessageRepository
from app.repositories.implementations.conversation_repository import ConversationRepository
from app.repositories.implementations.source_repository import SourceRepository
from app.repositories.implementations.message_repository import MessageRepository
from app.repositories.implementations.search_repository import SearchRepository
from app.repositories.implementations.source_repository import SourceRepository
from app.services.interfaces.web_search_service import WebSearchServiceInterface

from app.core.llm.prompts import AnalysisPrompt

logger = logging.getLogger(__name__)

console_handler = logging.StreamHandler()
Expand Down Expand Up @@ -218,38 +218,42 @@ async def _generate_analysis(
# logger.warning(f"length {len(log_probs)}, {log_probs}")

try:
# OLD VERSION KEPT FOR REFERENCE:
# Clean the text before parsing
# fmt: off
cleaned_text = (
full_text.strip()
.replace("\r", "") # Remove carriage returns
.replace("\x00", "") # Remove null bytes
.replace("\x1a", "") # Remove SUB characters
.replace("\n", "")
.replace("\\\'", "'")
.replace("\t", "")
)
# cleaned_text = (
# full_text.strip()
# .replace("\r", "") # Remove carriage returns
# .replace("\x00", "") # Remove null bytes
# .replace("\x1a", "") # Remove SUB characters
# .replace("\n", "")
# .replace("\\\'", "'")
# .replace("\t", "")
# )
# fmt: on
# Try to find the JSON object if there's additional text
try:
start_idx = cleaned_text.find("{")
end_idx = cleaned_text.rindex("}") + 1
if start_idx != -1 and end_idx != -1:
cleaned_text = cleaned_text[start_idx:end_idx]
except ValueError:
pass
# try:
# start_idx = cleaned_text.find("{")
# end_idx = cleaned_text.rindex("}") + 1
# if start_idx != -1 and end_idx != -1:
# cleaned_text = cleaned_text[start_idx:end_idx]
# except ValueError:
# pass

response_data = json.loads(cleaned_text)
# response_data = json.loads(cleaned_text)

logger.debug(response_data)
# logger.debug(response_data)

veracity_score = int(response_data.get("veracity_score", 0))
analysis_content = str(response_data.get("analysis", "No analysis provided"))
# veracity_score = int(response_data.get("veracity_score", 0))
# analysis_content = str(response_data.get("analysis", "No analysis provided"))

veracity_score = max(0, min(100, veracity_score))
# veracity_score = max(0, min(100, veracity_score))
veracity_score, analysis_content = parse_analysis_response(full_text)

current_analysis.veracity_score = float(veracity_score) / 100
current_analysis.analysis_text = analysis_content
# current_analysis.analysis_text = analysis_content

current_analysis.analysis_text = clean_unicode_text(analysis_content)
current_analysis.status = AnalysisStatus.completed.value
current_analysis.updated_at = datetime.now(UTC)

Expand Down Expand Up @@ -281,8 +285,8 @@ async def _generate_analysis(
import re

try:
veracity_match = re.search(r'"veracity_score"\s*:\s*([0-9]+(?:\.\d+)?)', cleaned_text)
analysis_match = re.search(r'"analysis"\s*:\s*"((?:[^"\\]|\\.)*)"', cleaned_text, re.DOTALL)
veracity_match = re.search(r'"veracity_score"\s*:\s*([0-9]+(?:\.\d+)?)', full_text)
analysis_match = re.search(r'"analysis"\s*:\s*"((?:[^"\\]|\\.)*)"', full_text, re.DOTALL)
logger.info("Successfully found regex matches")
if veracity_match and analysis_match:
veracity_score = int(veracity_match.group(1))
Expand Down Expand Up @@ -314,17 +318,38 @@ async def _generate_analysis(
yield {"type": "error", "content": f"Error parsing analysis response: {str(e)}"}
raise

# except Exception as e:
# logger.error(f"Error processing analysis: {str(e)}")
# current_analysis.status = AnalysisStatus.failed.value
# await self._analysis_repo.update(current_analysis)
# yield {"type": "error", "content": f"Error creating analysis: {str(e)}"}
# raise

except Exception as e:
logger.error(f"Error processing analysis: {str(e)}")
logger.error(
"Error processing analysis response: %s\nFull text: %r",
str(e),
full_text[:2000],
exc_info=True,
)

current_analysis.status = AnalysisStatus.failed.value
current_analysis.analysis_text = clean_unicode_text(full_text)
await self._analysis_repo.update(current_analysis)
yield {"type": "error", "content": f"Error creating analysis: {str(e)}"}
raise

yield {
"type": "error",
"content": "Sorry, we couldn’t complete the analysis. Please try again.",
}
return

except Exception as e:
logger.error(f"Error in _generate_analysis: {str(e)}", exc_info=True)
yield {"type": "error", "content": str(e)}
raise
yield {
"type": "error",
"content": "Sorry, we couldn’t complete the analysis. Please try again.",
}
return

async def initialize_claim_conversation(
self,
Expand Down
Loading
Loading