main.py
196 lines (180 loc) · 7.73 KB
import io
import os
import hashlib
import time
from fastapi import FastAPI, HTTPException, Header
from pydantic import BaseModel
from typing import List
import pdfplumber
import docx
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
from pinecone import Pinecone, ServerlessSpec
import requests
from urllib.parse import urlparse
import logging
import psutil
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="LLM-Powered Query-Retrieval System")
@app.get("/")
@app.head("/")
async def health_check():
    return {"status": "ok"}

PINECONE_INDEX_NAME = "policy-index"
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
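# all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the index is
# created with dimension=384 and cosine similarity.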
if PINECONE_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(PINECONE_INDEX_NAME)
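
# Model handles start as None and are populated lazily by init_models(),
# deferring the load cost and memory footprint to the first request.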
tokenizer = None
embed_model = None
qa_model = None

def init_models():
    global tokenizer, embed_model, qa_model
    if tokenizer is None:
        logger.info(f"Memory before model load: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
        try:
            tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", model_max_length=16)
            embed_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
            qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
            logger.info(f"Memory after model load: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
            logger.info("Models initialized")
        except Exception as e:
            logger.error(f"Model initialization failed: {e}")
            raise

class QueryRequest(BaseModel):
    documents: str
    questions: List[str]

class QueryResponse(BaseModel):
    answers: List[str]

def extract_text_from_pdf(url: str) -> str:
    start_time = time.time()
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        # For a non-streamed request the body has already been read, so
        # response.raw is exhausted; hand pdfplumber an in-memory buffer instead.
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            text = "".join(page.extract_text() or "" for page in pdf.pages)
            logger.info(f"PDF extraction successful, took {time.time() - start_time:.2f}s, pages: {len(pdf.pages)}")
        return text
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF processing failed: {e}")

def extract_text_from_docx(url: str) -> str:
    start_time = time.time()
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        # Parse from memory rather than a shared "temp.docx" on disk, which
        # would race under concurrent requests and leak on failure.
        doc = docx.Document(io.BytesIO(response.content))
        text = "\n".join(para.text for para in doc.paragraphs)
        logger.info(f"DOCX extraction took {time.time() - start_time:.2f}s")
        return text
    except Exception as e:
        logger.error(f"DOCX error: {e}")
        raise HTTPException(status_code=500, detail="Failed to process DOCX")

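# md5 here serves as a stable cache key for Pinecone metadata filters, not as
# a security-sensitive hash.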
def get_doc_id(url: str) -> str:
    return hashlib.md5(url.encode()).hexdigest()

def process_and_embed_document(text: str, doc_id: str):
    start_time = time.time()
    # Cheap existence check: query with a zero vector plus a metadata filter;
    # any match means this document was already embedded.
    existing = index.query(
        vector=[0] * 384,
        top_k=1,
        filter={"doc_id": doc_id},
        include_metadata=True
    )
    if existing["matches"]:
        logger.info(f"Document {doc_id} already embedded, skipping. Cache check took {time.time() - start_time:.2f}s")
        return
    # Tiny 15-character chunks with a 6-character overlap (the step between
    # chunk starts is chunk_size - 6), matching the aggressive max_length=16.
    chunk_size = 15
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - 6)]
    batch_size = 1  # embed one chunk at a time to bound peak memory
    init_models()
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=16)
        with torch.no_grad():
            # Mean-pool token embeddings into one 384-dim vector per chunk.
            embeddings = embed_model(**inputs).last_hidden_state.mean(dim=1).numpy()
        vectors = [
            (f"{doc_id}_chunk_{i + j}", embedding.tolist(), {"text": chunk, "doc_id": doc_id})
            for j, (chunk, embedding) in enumerate(zip(batch, embeddings))
        ]
        index.upsert(vectors=vectors)
    logger.info(f"Embedded {len(chunks)} chunks for document {doc_id} in {time.time() - start_time:.2f}s")

def semantic_search(query: str, top_k: int = 1) -> List[dict]:
    global tokenizer, embed_model
    if tokenizer is None:
        init_models()
    start_time = time.time()
    logger.info(f"Memory before semantic search: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=16)
    with torch.no_grad():
        query_embedding = embed_model(**inputs).last_hidden_state.mean(dim=1).numpy()[0]
    # Retry the Pinecone query up to three times, sleeping between attempts.
    for attempt in range(3):
        try:
            results = index.query(vector=query_embedding.tolist(), top_k=top_k, include_metadata=True)
            logger.info(f"Semantic search took {time.time() - start_time:.2f}s")
            logger.info(f"Memory after semantic search: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
            return results["matches"]
        except Exception as e:
            logger.error(f"Search attempt {attempt + 1} failed: {e}")
            if attempt == 2:
                raise
            time.sleep(1)
    return []

def process_query(query: str, relevant_chunks: List[dict]) -> str:
    start_time = time.time()
    logger.info(f"Memory before request: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
    if psutil.virtual_memory().percent > 95:
        raise HTTPException(status_code=500, detail="Memory overload")
    context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)
    if not context:
        return "No relevant information found in the document."
    result = qa_model(question=query, context=context)
    answer = result["answer"]
    score = result["score"]
    explanation = f"Answer: {answer} (Confidence: {score:.2f}). Based on document context: {context[:200]}..."
    logger.info(f"Query processing took {time.time() - start_time:.2f}s")
    return explanation

@app.post("/api/v1/hackrx/run", response_model=QueryResponse)
async def run_query(request: QueryRequest, authorization: str = Header(...)):
    start_time = time.time()
    expected_token = f"Bearer {os.environ.get('HACKRX_BEARER_TOKEN')}"
    if authorization != expected_token:
        raise HTTPException(status_code=401, detail="Invalid authorization token")
    doc_url = request.documents
    file_extension = os.path.splitext(urlparse(doc_url).path)[1].lower()
    doc_id = get_doc_id(doc_url)
    logger.info(f"Processing document ID: {doc_id}")
    # Same zero-vector existence check as in process_and_embed_document:
    # only download and embed the document if it is not already indexed.
    existing = index.query(
        vector=[0] * 384,
        top_k=1,
        filter={"doc_id": doc_id},
        include_metadata=True
    )
    if not existing["matches"]:
        if file_extension == ".pdf":
            text = extract_text_from_pdf(doc_url)
        elif file_extension == ".docx":
            text = extract_text_from_docx(doc_url)
        else:
            raise HTTPException(status_code=400, detail="Unsupported file format")
        process_and_embed_document(text, doc_id)
    answers = []
    for question in request.questions:
        relevant_chunks = semantic_search(question)
        answer = process_query(question, relevant_chunks)
        answers.append(answer)
    logger.info(f"Total request took {time.time() - start_time:.2f}s")
    logger.info(f"Memory after request: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
    return QueryResponse(answers=answers)

if __name__ == "__main__":
    import uvicorn

    port = int(os.environ.get("PORT", 8000))
    logger.info(f"Running uvicorn on port {port}")
    uvicorn.run(app, host="0.0.0.0", port=port)
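
# Example request (a sketch: the host, document URL, and question below are
# placeholders; the endpoint path, header, and payload shape come from
# run_query and QueryRequest above):
#
#   curl -X POST http://localhost:8000/api/v1/hackrx/run \
#     -H "Authorization: Bearer $HACKRX_BEARER_TOKEN" \
#     -H "Content-Type: application/json" \
#     -d '{"documents": "https://example.com/policy.pdf",
#          "questions": ["What is the grace period for premium payment?"]}'
#
# The response is a JSON object of the form {"answers": ["..."]}.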