From 877e3e6aa4fe365dc0e9588abab863bfa2183c6e Mon Sep 17 00:00:00 2001
From: Nadaf <nadaf@leeroo.com>
Date: Wed, 10 Dec 2025 15:25:08 +0000
Subject: [PATCH 1/2] Add paper learner knowledge extraction

---
 requirements.txt                        |  1 +
 src/knowledge/learners/paper_learner.py | 38 +++++++++++++++++--------
 tests/learners/test_paper_learner.py    | 12 ++++++++
 3 files changed, 39 insertions(+), 12 deletions(-)
 create mode 100644 tests/learners/test_paper_learner.py

diff --git a/requirements.txt b/requirements.txt
index 22a55666..d567680a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ aider-chat>=0.35.0
 
 # Knowledge graph database client
 neo4j
+docling
 
 # Configuration file parsing
 PyYAML
diff --git a/src/knowledge/learners/paper_learner.py b/src/knowledge/learners/paper_learner.py
index dbb1d173..4f9ff01b 100644
--- a/src/knowledge/learners/paper_learner.py
+++ b/src/knowledge/learners/paper_learner.py
@@ -5,6 +5,11 @@
 
 from typing import Any, Dict, List
 
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions
+from docling_core.types.doc.document import PictureDescriptionData
+
 from src.knowledge.learners.base import Learner, KnowledgeChunk
 from src.knowledge.learners.factory import register_learner
 
@@ -34,22 +39,32 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
         
         Args:
             source_data: Dict with "path" (local file) or "url" (remote PDF)
-            
         Returns:
             List of KnowledgeChunk from the paper
         """
         path = source_data.get("path", source_data.get("url", ""))
         
-        chunks = []
-        
-        # TODO: Implement actual PDF parsing
-        # 1. Load PDF (local or download from URL)
-        # 2. Extract text using PyPDF2 or pdfplumber
-        # 3. Identify sections (Abstract, Methods, Results, etc.)
-        # 4. Extract formulas using OCR if needed
-        # 5. Create structured chunks per section
-        
-        # Placeholder: Create a single chunk indicating the source
+        smolvlm_picture_description = PictureDescriptionVlmOptions(
+            repo_id='HuggingFaceTB/SmolVLM-256M-Instruct',
+            prompt="Describe the picture in detail. Make sure to include all the details of the picture."
+        )
+        pipeline_options = PdfPipelineOptions(
+            do_formula_enrichment = True,
+            do_picture_description = True,
+        )
+
+        converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=pipeline_options
+                )
+            }
+        )
+        result = converter.convert(path)        
+        markdown_content = doc.document.export_to_markdown()
+        print(markdown_content)
+
+        chunks = []        
         chunks.append(KnowledgeChunk(
             content=f"Paper knowledge from {path}",
             chunk_type="text",
@@ -59,4 +74,3 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
         
         print(f"[PaperLearner] Learned from paper: {path}")
         return chunks
-
diff --git a/tests/learners/test_paper_learner.py b/tests/learners/test_paper_learner.py
new file mode 100644
index 00000000..e0990472
--- /dev/null
+++ b/tests/learners/test_paper_learner.py
@@ -0,0 +1,12 @@
+from src.knowledge.learners.paper_learner import PaperLearner
+
+
+def test_paper_learner():
+    """Smoke-test PaperLearner on a remote PDF (requires network access)."""
+    learner = PaperLearner(params={})
+    chunks = learner.learn({"url": "https://arxiv.org/pdf/1706.03762"})
+    assert isinstance(chunks, list) and chunks, "expected a non-empty list of chunks"
+    print(chunks)
+
+if __name__ == "__main__":  # allow running this smoke test directly, without pytest
+    test_paper_learner()

From b6f68558f1599562ae1162ff6fbd6d858890705c Mon Sep 17 00:00:00 2001
From: Nadaf <nadaf@leeroo.com>
Date: Thu, 11 Dec 2025 12:00:03 +0000
Subject: [PATCH 2/2] Fix image description writer

---
 src/knowledge/learners/paper_learner.py | 46 +++++++++++++++++++------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/knowledge/learners/paper_learner.py b/src/knowledge/learners/paper_learner.py
index 4f9ff01b..4dd2e9b9 100644
--- a/src/knowledge/learners/paper_learner.py
+++ b/src/knowledge/learners/paper_learner.py
@@ -3,17 +3,17 @@
 # Extracts knowledge from research papers (PDFs).
 # Parses sections, abstracts, formulas, and key findings.
 
+import os
 from typing import Any, Dict, List
 
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions, PictureDescriptionApiOptions
 from docling_core.types.doc.document import PictureDescriptionData
 
 from src.knowledge.learners.base import Learner, KnowledgeChunk
 from src.knowledge.learners.factory import register_learner
 
-
 @register_learner("paper")
 class PaperLearner(Learner):
     """
@@ -43,14 +43,12 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
             List of KnowledgeChunk from the paper
         """
         path = source_data.get("path", source_data.get("url", ""))
-        
-        smolvlm_picture_description = PictureDescriptionVlmOptions(
-            repo_id='HuggingFaceTB/SmolVLM-256M-Instruct',
-            prompt="Describe the picture in detail. Make sure to include all the details of the picture."
-        )
+
         pipeline_options = PdfPipelineOptions(
             do_formula_enrichment = True,
             do_picture_description = True,
+            picture_description_options=self._create_picture_description_options(),
+            enable_remote_services=True,
         )
 
         converter = DocumentConverter(
@@ -61,9 +59,9 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
             }
         )
         result = converter.convert(path)        
-        markdown_content = doc.document.export_to_markdown()
-        print(markdown_content)
-
+        markdown_content = result.document.export_to_markdown()
+        
+        # TODO: Convert the extracted markdown into knowledge-graph (KG) chunks.
         chunks = []        
         chunks.append(KnowledgeChunk(
             content=f"Paper knowledge from {path}",
@@ -74,3 +72,31 @@ def learn(self, source_data: Dict[str, Any]) -> List[KnowledgeChunk]:
         
         print(f"[PaperLearner] Learned from paper: {path}")
         return chunks
+
+    def _create_picture_description_options(self) -> PictureDescriptionApiOptions:
+        """Build docling's API-based picture description options (OpenAI endpoint).
+
+        Raises:
+            ValueError: If the OPENAI_API_KEY environment variable is unset or
+                empty; otherwise the header would be sent as "Bearer None".
+        """
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError("OPENAI_API_KEY environment variable is not set")
+        image_description_prompt = """
+            Describe the picture in details. Make sure to include all the details, for exampel, convert flows and diagrams to text.
+            Ignore examples, and details of messy diagrams. Only extract and summarize the main content and idea of the picture. 
+            put your description in the following format:
+            <image_description>
+                Textual description of the picture.
+            </image_description>
+        """
+        # TODO: Add compatibility for other LLM provider APIs.
+        return PictureDescriptionApiOptions(
+            url="https://api.openai.com/v1/chat/completions",
+            headers={"Authorization": f"Bearer {api_key}"},
+            params=dict(model="gpt-4o", max_completion_tokens=500),
+            prompt=image_description_prompt,
+            timeout=90,
+        )
+