From 877e3e6aa4fe365dc0e9588abab863bfa2183c6e Mon Sep 17 00:00:00 2001 From: Nadaf Date: Wed, 10 Dec 2025 15:25:08 +0000 Subject: [PATCH 1/2] Add paper learner knowledge extraction --- requirements.txt | 1 + src/knowledge/learners/paper_learner.py | 38 +++++++++++++++++-------- tests/learners/test_paper_learner.py | 12 ++++++++ 3 files changed, 39 insertions(+), 12 deletions(-) create mode 100644 tests/learners/test_paper_learner.py diff --git a/requirements.txt b/requirements.txt index 22a55666..d567680a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ aider-chat>=0.35.0 # Knowledge graph database client neo4j +docling # Configuration file parsing PyYAML diff --git a/src/knowledge/learners/paper_learner.py b/src/knowledge/learners/paper_learner.py index dbb1d173..4f9ff01b 100644 --- a/src/knowledge/learners/paper_learner.py +++ b/src/knowledge/learners/paper_learner.py @@ -5,6 +5,11 @@ from typing import Any, Dict, List +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions +from docling_core.types.doc.document import PictureDescriptionData + from src.knowledge.learners.base import Learner, KnowledgeChunk from src.knowledge.learners.factory import register_learner @@ -34,22 +39,32 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]: Args: source_data: Dict with "path" (local file) or "url" (remote PDF) - Returns: List of KnowledgeChunk from the paper """ path = source_data.get("path", source_data.get("url", "")) - chunks = [] - - # TODO: Implement actual PDF parsing - # 1. Load PDF (local or download from URL) - # 2. Extract text using PyPDF2 or pdfplumber - # 3. Identify sections (Abstract, Methods, Results, etc.) - # 4. Extract formulas using OCR if needed - # 5. Create structured chunks per section - - # Placeholder: Create a single chunk indicating the source + smolvlm_picture_description = PictureDescriptionVlmOptions( + repo_id='HuggingFaceTB/SmolVLM-256M-Instruct', + prompt="Describe the picture in detail. Make sure to include all the details of the picture." + ) + pipeline_options = PdfPipelineOptions( + do_formula_enrichment = True, + do_picture_description = True, + ) + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options + ) + } + ) + result = converter.convert(path) + markdown_content = doc.document.export_to_markdown() + print(markdown_content) + + chunks = [] chunks.append(KnowledgeChunk( content=f"Paper knowledge from {path}", chunk_type="text", @@ -59,4 +74,3 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]: print(f"[PaperLearner] Learned from paper: {path}") return chunks - diff --git a/tests/learners/test_paper_learner.py b/tests/learners/test_paper_learner.py new file mode 100644 index 00000000..e0990472 --- /dev/null +++ b/tests/learners/test_paper_learner.py @@ -0,0 +1,12 @@ +from src.knowledge.learners.paper_learner import PaperLearner + + +def test_paper_learner(): + learner = PaperLearner(params={}) + + test_data = {"url": "https://arxiv.org/pdf/1706.03762"} + chunks = learner.learn(test_data) + print(chunks) + +if __name__ == "__main__": + test_paper_learner() From b6f68558f1599562ae1162ff6fbd6d858890705c Mon Sep 17 00:00:00 2001 From: Nadaf Date: Thu, 11 Dec 2025 12:00:03 +0000 Subject: [PATCH 2/2] Fix image description writer --- src/knowledge/learners/paper_learner.py | 46 +++++++++++++++++++------ 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/knowledge/learners/paper_learner.py b/src/knowledge/learners/paper_learner.py index 4f9ff01b..4dd2e9b9 100644 --- a/src/knowledge/learners/paper_learner.py +++ b/src/knowledge/learners/paper_learner.py @@ -3,17 +3,17 @@ # Extracts knowledge from research papers (PDFs). # Parses sections, abstracts, formulas, and key findings. +import os from typing import Any, Dict, List from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions, PictureDescriptionApiOptions from docling_core.types.doc.document import PictureDescriptionData from src.knowledge.learners.base import Learner, KnowledgeChunk from src.knowledge.learners.factory import register_learner - @register_learner("paper") class PaperLearner(Learner): """ @@ -43,14 +43,12 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]: List of KnowledgeChunk from the paper """ path = source_data.get("path", source_data.get("url", "")) - - smolvlm_picture_description = PictureDescriptionVlmOptions( - repo_id='HuggingFaceTB/SmolVLM-256M-Instruct', - prompt="Describe the picture in detail. Make sure to include all the details of the picture." - ) + pipeline_options = PdfPipelineOptions( do_formula_enrichment = True, do_picture_description = True, + picture_description_options=self._create_picture_description_options(), + enable_remote_services=True, ) converter = DocumentConverter( @@ -61,9 +59,9 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]: } ) result = converter.convert(path) - markdown_content = doc.document.export_to_markdown() - print(markdown_content) - + markdown_content = result.document.export_to_markdown() + + # TODO: Convert markdown to KG. chunks = [] chunks.append(KnowledgeChunk( content=f"Paper knowledge from {path}", @@ -74,3 +72,31 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]: print(f"[PaperLearner] Learned from paper: {path}") return chunks + + def _create_picture_description_options(self) -> PictureDescriptionApiOptions: + """ + Create the picture description options. + """ + + image_description_prompt = """ + Describe the picture in details. Make sure to include all the details, for exampel, convert flows and diagrams to text. + Ignore examples, and details of messy diagrams. Only extract and summarize the main content and idea of the picture. + put your description in the following format: + + Textual description of the picture. + + """ + # TODO: Add compatibility for other LLM provider APIs. + return PictureDescriptionApiOptions( + url="https://api.openai.com/v1/chat/completions", + headers={ + "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}" + }, + params=dict( + model="gpt-4o", + max_completion_tokens=500, + ), + prompt=image_description_prompt, + timeout=90, + ) +