diff --git a/requirements.txt b/requirements.txt
index 22a55666..d567680a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,9 @@ aider-chat>=0.35.0
 
 # Knowledge graph database client
 neo4j
 
+# PDF parsing and conversion (used by PaperLearner)
+docling
+
 # Configuration file parsing
 PyYAML
diff --git a/src/knowledge/learners/paper_learner.py b/src/knowledge/learners/paper_learner.py
index dbb1d173..4dd2e9b9 100644
--- a/src/knowledge/learners/paper_learner.py
+++ b/src/knowledge/learners/paper_learner.py
@@ -3,12 +3,22 @@
 # Extracts knowledge from research papers (PDFs).
 # Parses sections, abstracts, formulas, and key findings.
 
+import os
 from typing import Any, Dict, List
 
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    PictureDescriptionApiOptions,
+    PictureDescriptionVlmOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc.document import PictureDescriptionData
+
 from src.knowledge.learners.base import Learner, KnowledgeChunk
 from src.knowledge.learners.factory import register_learner
 
 
 @register_learner("paper")
 class PaperLearner(Learner):
     """
@@ -34,22 +44,39 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
 
         Args:
             source_data: Dict with "path" (local file) or "url" (remote PDF)
-
+
         Returns:
             List of KnowledgeChunk from the paper
+
+        Raises:
+            ValueError: If source_data has neither a "path" nor a "url"
         """
-        path = source_data.get("path", source_data.get("url", ""))
+        path = source_data.get("path") or source_data.get("url")
+        if not path:
+            raise ValueError('source_data must contain a non-empty "path" or "url"')
+
+        # Picture description is served by a remote API, so remote services
+        # have to be enabled explicitly on the pipeline.
+        pipeline_options = PdfPipelineOptions(
+            do_formula_enrichment=True,
+            do_picture_description=True,
+            picture_description_options=self._create_picture_description_options(),
+            enable_remote_services=True,
+        )
+
+        converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=pipeline_options
+                )
+            }
+        )
+        result = converter.convert(path)
+        markdown_content = result.document.export_to_markdown()
 
-        chunks = []
-
-        # TODO: Implement actual PDF parsing
-        # 1. Load PDF (local or download from URL)
-        # 2. Extract text using PyPDF2 or pdfplumber
-        # 3. Identify sections (Abstract, Methods, Results, etc.)
-        # 4. Extract formulas using OCR if needed
-        # 5. Create structured chunks per section
-
-        # Placeholder: Create a single chunk indicating the source
+        # TODO: Convert markdown to KG. markdown_content is not consumed yet;
+        # the chunk below is still the pre-existing placeholder.
+        chunks = []
         chunks.append(KnowledgeChunk(
             content=f"Paper knowledge from {path}",
             chunk_type="text",
@@ -60,3 +87,44 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
         print(f"[PaperLearner] Learned from paper: {path}")
         return chunks
 
+    def _create_picture_description_options(self) -> PictureDescriptionApiOptions:
+        """
+        Create the picture description options.
+
+        Builds the docling options for describing pictures through the
+        OpenAI chat-completions API, authenticated with OPENAI_API_KEY.
+
+        Returns:
+            PictureDescriptionApiOptions for the remote description call
+
+        Raises:
+            RuntimeError: If the OPENAI_API_KEY environment variable is unset
+        """
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            # Fail fast instead of sending an "Authorization: Bearer None" header.
+            raise RuntimeError(
+                "OPENAI_API_KEY environment variable is required for picture description"
+            )
+
+        image_description_prompt = """
+        Describe the picture in detail. Make sure to include all the details, for example, convert flows and diagrams to text.
+        Ignore examples, and details of messy diagrams. Only extract and summarize the main content and idea of the picture.
+        Put your description in the following format:
+
+        Textual description of the picture.
+
+        """
+        # TODO: Add compatibility for other LLM provider APIs.
+        return PictureDescriptionApiOptions(
+            url="https://api.openai.com/v1/chat/completions",
+            headers={
+                "Authorization": f"Bearer {api_key}"
+            },
+            params=dict(
+                model="gpt-4o",
+                max_completion_tokens=500,
+            ),
+            prompt=image_description_prompt,
+            timeout=90,
+        )
diff --git a/tests/learners/test_paper_learner.py b/tests/learners/test_paper_learner.py
new file mode 100644
index 00000000..e0990472
--- /dev/null
+++ b/tests/learners/test_paper_learner.py
@@ -0,0 +1,17 @@
+from src.knowledge.learners.paper_learner import PaperLearner
+
+
+def test_paper_learner():
+    """Smoke test: learn from a remote PDF (needs network and OPENAI_API_KEY)."""
+    learner = PaperLearner(params={})
+
+    test_data = {"url": "https://arxiv.org/pdf/1706.03762"}
+    chunks = learner.learn(test_data)
+    print(chunks)
+
+    assert isinstance(chunks, list)
+    assert chunks, "expected at least one KnowledgeChunk"
+
+
+if __name__ == "__main__":
+    test_paper_learner()