From 63bd356145eff9407f3f51f0571b5ef5c5d2e9d9 Mon Sep 17 00:00:00 2001
From: Balaxxe <136368465+Balaxxe@users.noreply.github.com>
Date: Tue, 19 May 2026 12:23:52 -0700
Subject: [PATCH 1/4] Use thread-local MarkItDown; add doc classifier

Switch MarkItDown caching to thread-local instances (threading.local) so concurrent workers can convert without serializing on a single global instance; initialization errors are remembered per-thread and reset_markitdown_instance clears the thread-local cache. Remove the global convert lock and allow concurrent convert/convert_stream calls on thread-local instances. Change PDF-to-image conversion to use convert_from_path with paths_only (write files directly to disk) and rename temporary outputs to expected page_###.<ext> names to avoid large in-memory image objects. Add classify_document_type() in mistral_converter with filename/text heuristics and optional LLM fallback, and route dynamic document schema selection through _ocr_shared_optional_params/build_ocr_process_kwargs/process_with_ocr/_prepare_batch_entries by accepting file_path and doc_type. Update tests to reflect thread-local behavior, on-disk PDF conversion (expect paths and paths_only flag), and document classification routing.
---
 local_converter.py              | 104 +++++++++++++---------------
 mistral_converter.py            | 116 +++++++++++++++++++++++++++++++-
 tests/test_local_converter.py   |  99 +++++++++++++++++++++------
 tests/test_mistral_converter.py |  47 +++++++++++++
 4 files changed, 284 insertions(+), 82 deletions(-)
diff --git a/local_converter.py b/local_converter.py
index 8f6a4ab..c495190 100644
--- a/local_converter.py
+++ b/local_converter.py
@@ -69,8 +69,8 @@
 
 _MARKITDOWN_UNSET = object()  # sentinel: init never attempted
 _markitdown_instance = _MARKITDOWN_UNSET
+_markitdown_instances = threading.local()
 _markitdown_lock = threading.Lock()
-_markitdown_convert_lock = threading.Lock()
 
 
 def _build_markitdown_kwargs() -> Dict[str, Any]:
@@ -110,42 +110,39 @@ def _build_markitdown_kwargs() -> Dict[str, Any]:
 
 def get_markitdown_instance() -> Optional[MarkItDown]:
     """
-    Create and configure a MarkItDown instance (thread-safe).
+    Create and configure a MarkItDown instance (thread-safe, thread-local).
 
-    Uses a module-level cache protected by a lock so concurrent threads
-    in batch processing don't race on initialization.  A failed init is
-    remembered (instance set to ``None``) so subsequent calls return
-    immediately without retrying or logging duplicate errors.
+    Uses a thread-local cache so concurrent threads in batch processing
+    don't serialize on conversion tasks.
     """
     global _markitdown_instance
 
-    cached = _markitdown_instance
-    if cached is not _MARKITDOWN_UNSET:
-        return cached  # either a live instance or None (failed init)
+    if not hasattr(_markitdown_instances, "instance"):
+        with _markitdown_lock:
+            if MarkItDown is None:
+                logger.error("MarkItDown not installed. Install with: pip install markitdown")
+                _markitdown_instances.instance = None
+                _markitdown_instance = None
+                return None
 
-    with _markitdown_lock:
-        if _markitdown_instance is not _MARKITDOWN_UNSET:
-            return _markitdown_instance  # pragma: no cover
-
-        if MarkItDown is None:
-            logger.error("MarkItDown not installed. Install with: pip install markitdown")
-            _markitdown_instance = None
-            return None
-
-        try:
-            _markitdown_instance = MarkItDown(**_build_markitdown_kwargs())
-            return _markitdown_instance
+            try:
+                instance = MarkItDown(**_build_markitdown_kwargs())
+                _markitdown_instances.instance = instance
+                _markitdown_instance = instance
+            except Exception as e:
+                logger.error("Error initializing MarkItDown: %s", e)
+                _markitdown_instances.instance = None
+                _markitdown_instance = None
 
-        except Exception as e:
-            logger.error("Error initializing MarkItDown: %s", e)
-            _markitdown_instance = None  # remember the failure
-            return None
+    return _markitdown_instances.instance
 
 
 def reset_markitdown_instance():
     """Reset the cached MarkItDown instance so the next call retries initialization."""
     global _markitdown_instance
     with _markitdown_lock:
+        if hasattr(_markitdown_instances, "instance"):
+            delattr(_markitdown_instances, "instance")
         _markitdown_instance = _MARKITDOWN_UNSET
 
 
@@ -177,10 +174,8 @@ def convert_with_markitdown(
     try:
         logger.info("Converting with MarkItDown: %s", file_path.name)
 
-        # Serialize convert() — MarkItDown is not documented as thread-safe for
-        # concurrent converts while we share one cached instance across workers.
-        with _markitdown_convert_lock:
-            result = md.convert(str(file_path))
+        # Convert concurrently using thread-local MarkItDown instance
+        result = md.convert(str(file_path))
 
         if result and hasattr(result, "markdown"):
             markdown_content = result.markdown
@@ -273,17 +268,16 @@ def convert_stream_with_markitdown(
 
     try:
         logger.info("Converting stream with MarkItDown: %s", filename)
-        with _markitdown_convert_lock:
-            stream_info = None
-            if StreamInfo is not None:
-                stream_info = StreamInfo(
-                    extension=Path(filename).suffix,
-                    filename=filename,
-                )
-            if stream_info is not None:
-                result = md.convert_stream(stream, stream_info=stream_info)
-            else:
-                result = md.convert_stream(stream, file_extension=Path(filename).suffix)
+        stream_info = None
+        if StreamInfo is not None:
+            stream_info = StreamInfo(
+                extension=Path(filename).suffix,
+                filename=filename,
+            )
+        if stream_info is not None:
+            result = md.convert_stream(stream, stream_info=stream_info)
+        else:
+            result = md.convert_stream(stream, file_extension=Path(filename).suffix)
 
         if result and hasattr(result, "markdown"):
             return True, result.markdown, None
@@ -770,7 +764,7 @@ def convert_pdf_to_images(
         # Configure poppler path for Windows
         poppler_path = (config.POPPLER_PATH or None) if sys.platform == "win32" else None
 
-        # Build conversion parameters
+        # Build conversion parameters (using paths_only to avoid RAM spikes on large PDFs)
         convert_params = {
             "pdf_path": str(pdf_path),
             "dpi": dpi,
@@ -779,28 +773,24 @@ def convert_pdf_to_images(
             "poppler_path": poppler_path,
             "thread_count": max(1, thread_count),
             "use_pdftocairo": config.PDF_IMAGE_USE_PDFTOCAIRO,
+            "output_file": "page",
+            "paths_only": True,
         }
 
-        # Convert PDF to images
-        images = convert_from_path(**convert_params)
+        # Convert PDF to images directly on disk
+        temp_paths = convert_from_path(**convert_params)
 
-        # Save images
+        # Rename output files to preserve expected page_001.ext structure
         image_paths = []
         file_extension = "jpg" if config.PDF_IMAGE_FORMAT == "jpeg" else config.PDF_IMAGE_FORMAT
 
-        for i, image in enumerate(images, 1):
-            image_path = output_dir / f"page_{i:03d}.{file_extension}"
-
-            # Save with format-specific options
-            if config.PDF_IMAGE_FORMAT == "jpeg":
-                image.save(str(image_path), "JPEG", quality=85, optimize=True, progressive=True)
-            elif config.PDF_IMAGE_FORMAT == "png":
-                image.save(str(image_path), "PNG", optimize=True)
-            else:
-                image.save(str(image_path), config.PDF_IMAGE_FORMAT.upper())
-
-            image_paths.append(image_path)
-            logger.debug("Saved page %d to %s", i, image_path.name)
+        for i, temp_path_str in enumerate(temp_paths, 1):
+            temp_path = Path(temp_path_str)
+            target_path = output_dir / f"page_{i:03d}.{file_extension}"
+            if temp_path.exists() and temp_path != target_path:
+                temp_path.replace(target_path)
+            image_paths.append(target_path)
+            logger.debug("Saved page %d to %s", i, target_path.name)
 
         logger.info(
             "Converted %d pages to %s images",
diff --git a/mistral_converter.py b/mistral_converter.py
index 8f22b32..6466520 100644
--- a/mistral_converter.py
+++ b/mistral_converter.py
@@ -953,11 +953,117 @@ def _cleanup_temp_files(temp_files: List[Path]) -> None:
             logger.warning("Could not delete temporary file %s: %s", temp_file.name, e)
 
 
-def _ocr_shared_optional_params() -> Dict[str, Any]:
+def classify_document_type(file_path: Path) -> str:
+    """Classify document type into generic, invoice, financial_statement, contract, or form.
+
+    Uses a hybrid approach:
+    1. Regex checks on filename/path.
+    2. Parsing first-page text (if PDF or text file) for key identifiers.
+    3. Cheap LLM classification fallback (using ministral-8b-latest or mistral-small-latest).
+    """
+    name = file_path.name.lower()
+
+    # 1. Filename heuristic
+    if any(w in name for w in ["invoice", "receipt", "bill"]):
+        logger.debug("Classified %s as 'invoice' via filename", file_path.name)
+        return "invoice"
+    if any(w in name for w in ["contract", "agreement", "nda", "lease"]):
+        logger.debug("Classified %s as 'contract' via filename", file_path.name)
+        return "contract"
+    if any(w in name for w in ["statement", "financial", "balance_sheet", "income"]):
+        logger.debug("Classified %s as 'financial_statement' via filename", file_path.name)
+        return "financial_statement"
+    if any(w in name for w in ["form", "w9", "w2", "tax"]):
+        logger.debug("Classified %s as 'form' via filename", file_path.name)
+        return "form"
+
+    # 2. First-page text content check (for text PDFs/txt files)
+    ext = file_path.suffix.lower().lstrip(".")
+    first_text = ""
+
+    if ext == "pdf":
+        try:
+            import pdfplumber
+
+            with pdfplumber.open(file_path) as pdf:
+                if pdf.pages:
+                    first_text = (pdf.pages[0].extract_text() or "")[:1000]
+        except Exception as e:
+            logger.debug("Could not extract PDF text for classification: %s", e)
+    elif ext == "txt":
+        try:
+            first_text = file_path.read_text(errors="ignore")[:1000]
+        except Exception as e:
+            logger.debug("Could not read text file for classification: %s", e)
+
+    if first_text:
+        first_text_lower = first_text.lower()
+        if any(w in first_text_lower for w in ["invoice", "receipt", "purchase order", "amount due"]):
+            logger.debug("Classified %s as 'invoice' via page 1 text", file_path.name)
+            return "invoice"
+        if any(w in first_text_lower for w in ["agreement", "contract", "parties", "hereby", "undersigned"]):
+            logger.debug("Classified %s as 'contract' via page 1 text", file_path.name)
+            return "contract"
+        if any(
+            w in first_text_lower
+            for w in ["statement of", "balance sheet", "cash flow", "income statement", "assets", "liabilities"]
+        ):
+            logger.debug("Classified %s as 'financial_statement' via page 1 text", file_path.name)
+            return "financial_statement"
+        if any(w in first_text_lower for w in ["form ", "w-9", "w-2", "tax return", "filer"]):
+            logger.debug("Classified %s as 'form' via page 1 text", file_path.name)
+            return "form"
+
+    # 3. LLM fallback check (if API key is present)
+    if config.MISTRAL_API_KEY:
+        try:
+            client = get_mistral_client()
+            if client:
+                prompt = (
+                    f"Classify the document '{file_path.name}' into one of these types: "
+                    "invoice, financial_statement, contract, form, generic.\n"
+                    "Reply with only the lowercase name of the type.\n"
+                )
+                if first_text:
+                    prompt += f"First page excerpt:\n{first_text[:500]}\n"
+
+                model = config.MISTRAL_DOCUMENT_QNA_MODEL or "ministral-8b-latest"
+                response = client.chat.complete(
+                    model=model,
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=10,
+                )
+                if response and response.choices:
+                    result = response.choices[0].message.content.strip().lower()
+                    for t in ["invoice", "financial_statement", "contract", "form"]:
+                        if t in result:
+                            logger.debug("Classified %s as '%s' via LLM", file_path.name, t)
+                            return t
+        except Exception as e:
+            logger.debug("LLM classification check failed: %s", e)
+
+    logger.debug("Classified %s as 'generic'", file_path.name)
+    return "generic"
+
+
+def _ocr_shared_optional_params(file_path: Optional[Path] = None, doc_type: str = "auto") -> Dict[str, Any]:
     """OCR fields shared by sync ``ocr.process`` and batch JSONL ``body``."""
     fields: Dict[str, Any] = {}
     bbox_format = get_bbox_annotation_format()
-    doc_format = get_document_annotation_format()
+
+    # Resolve dynamic classification if doc_type is "auto"
+    resolved_doc_type = doc_type
+    if resolved_doc_type == "auto":
+        nested = config.MISTRAL_DOCUMENT_SCHEMA_TYPE
+        if nested == "auto":
+            if file_path is not None:
+                resolved_doc_type = classify_document_type(file_path)
+            else:
+                resolved_doc_type = "generic"
+        else:
+            resolved_doc_type = nested
+
+    doc_format = get_document_annotation_format(doc_type=resolved_doc_type)
     if bbox_format is not None:
         fields["bbox_annotation_format"] = bbox_format
     if doc_format is not None:
@@ -982,6 +1088,8 @@ def build_ocr_process_kwargs(
     include_retries: bool,
     pages: Optional[List[int]] = None,
     request_id: Optional[str] = None,
+    file_path: Optional[Path] = None,
+    doc_type: str = "auto",
 ) -> Dict[str, Any]:
     """Build kwargs for ``client.ocr.process`` or a batch JSONL line ``body``."""
     ocr_params: Dict[str, Any] = {
@@ -995,7 +1103,7 @@ def build_ocr_process_kwargs(
         ocr_params["id"] = request_id
     if pages is not None:
         ocr_params["pages"] = pages
-    ocr_params.update(_ocr_shared_optional_params())
+    ocr_params.update(_ocr_shared_optional_params(file_path=file_path, doc_type=doc_type))
     return ocr_params
 
 
@@ -1130,6 +1238,7 @@ def _report_progress(message: str, progress: float = 0.0):
             include_retries=True,
             pages=pages,
             request_id=ocr_id,
+            file_path=file_path,
         )
 
         response = client.ocr.process(**ocr_params)
@@ -2521,6 +2630,7 @@ def _prepare_batch_entries(
             include_retries=False,
             pages=None,
             request_id=custom_id,
+            file_path=file_path,
         )
         body["include_image_base64"] = include_image_base64
 
diff --git a/tests/test_local_converter.py b/tests/test_local_converter.py
index 9bde21c..efb331a 100644
--- a/tests/test_local_converter.py
+++ b/tests/test_local_converter.py
@@ -277,6 +277,26 @@ def test_cached_on_second_call(self):
         second = local_converter.get_markitdown_instance()
         assert first is second
 
+    def test_thread_local_isolation(self):
+        local_converter.reset_markitdown_instance()
+        import threading
+
+        instances = []
+
+        def worker():
+            inst = local_converter.get_markitdown_instance()
+            instances.append(inst)
+
+        t1 = threading.Thread(target=worker)
+        t2 = threading.Thread(target=worker)
+        t1.start()
+        t2.start()
+        t1.join()
+        t2.join()
+
+        if instances[0] is not None and instances[1] is not None:
+            assert instances[0] is not instances[1]
+
 
 # ============================================================================
 # convert_with_markitdown Tests (mocked)
@@ -582,6 +602,7 @@ def test_not_installed(self, tmp_path):
         assert "not installed" in error.lower() or "not available" in error.lower()
 
     def test_successful_conversion(self, tmp_path, monkeypatch):
+        output_dir = tmp_path / "test_pages"
         monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path)
         monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200)
         monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "png")
@@ -592,15 +613,27 @@ def test_successful_conversion(self, tmp_path, monkeypatch):
         pdf_file = tmp_path / "test.pdf"
         pdf_file.write_bytes(b"%PDF-1.4")
 
-        # Create mock PIL Image objects
-        mock_img1 = MagicMock()
-        mock_img2 = MagicMock()
+        from pathlib import Path
 
-        with patch.object(local_converter, "convert_from_path", return_value=[mock_img1, mock_img2]):
-            success, paths, error = local_converter.convert_pdf_to_images(pdf_file)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        temp_file1 = str(output_dir / "page-1.png")
+        temp_file2 = str(output_dir / "page-2.png")
+        Path(temp_file1).touch()
+        Path(temp_file2).touch()
+
+        with patch.object(local_converter, "convert_from_path", return_value=[temp_file1, temp_file2]) as mock_convert:
+            success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir)
 
         assert success is True
         assert len(paths) == 2
+        assert paths[0] == output_dir / "page_001.png"
+        assert paths[1] == output_dir / "page_002.png"
+        assert (output_dir / "page_001.png").exists()
+        assert (output_dir / "page_002.png").exists()
+
+        mock_convert.assert_called_once()
+        kwargs = mock_convert.call_args[1]
+        assert kwargs["paths_only"] is True
 
     def test_handles_conversion_error(self, tmp_path, monkeypatch):
         monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path)
@@ -1113,6 +1146,7 @@ class TestConvertPdfToImagesPng:
     """Line 688: PNG format save branch."""
 
     def test_png_format(self, tmp_path, monkeypatch):
+        output_dir = tmp_path / "png_pages"
         monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path)
         monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200)
         monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "png")
@@ -1123,19 +1157,25 @@ def test_png_format(self, tmp_path, monkeypatch):
         pdf_file = tmp_path / "test.pdf"
         pdf_file.write_bytes(b"%PDF-1.4")
 
-        mock_img = MagicMock()
+        from pathlib import Path
 
-        with patch.object(local_converter, "convert_from_path", return_value=[mock_img]):
-            success, paths, error = local_converter.convert_pdf_to_images(pdf_file)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        temp_file = str(output_dir / "page-1.png")
+        Path(temp_file).touch()
+
+        with patch.object(local_converter, "convert_from_path", return_value=[temp_file]) as mock_convert:
+            success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir)
 
         assert success is True
-        # Verify PNG save was called
-        mock_img.save.assert_called_once()
-        call_args = mock_img.save.call_args
-        assert "PNG" in call_args[0]
+        assert paths[0] == output_dir / "page_001.png"
+        mock_convert.assert_called_once()
+        kwargs = mock_convert.call_args[1]
+        assert kwargs["paths_only"] is True
+        assert kwargs["fmt"] == "png"
 
     def test_jpeg_format(self, tmp_path, monkeypatch):
         """Verify JPEG format branch for completeness."""
+        output_dir = tmp_path / "jpeg_pages"
         monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path)
         monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200)
         monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "jpeg")
@@ -1146,14 +1186,21 @@ def test_jpeg_format(self, tmp_path, monkeypatch):
         pdf_file = tmp_path / "test.pdf"
         pdf_file.write_bytes(b"%PDF-1.4")
 
-        mock_img = MagicMock()
+        from pathlib import Path
 
-        with patch.object(local_converter, "convert_from_path", return_value=[mock_img]):
-            success, paths, error = local_converter.convert_pdf_to_images(pdf_file)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        temp_file = str(output_dir / "page-1.jpg")
+        Path(temp_file).touch()
+
+        with patch.object(local_converter, "convert_from_path", return_value=[temp_file]) as mock_convert:
+            success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir)
 
         assert success is True
-        call_args = mock_img.save.call_args
-        assert "JPEG" in call_args[0]
+        assert paths[0] == output_dir / "page_001.jpg"
+        mock_convert.assert_called_once()
+        kwargs = mock_convert.call_args[1]
+        assert kwargs["paths_only"] is True
+        assert kwargs["fmt"] == "jpeg"
 
 
 # ============================================================================
@@ -1357,6 +1404,7 @@ class TestConvertPdfToImagesOtherFormat:
 
     def test_tiff_format(self, tmp_path, monkeypatch):
         """Other format branch uses PDF_IMAGE_FORMAT.upper()."""
+        output_dir = tmp_path / "tiff_pages"
         monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path)
         monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200)
         monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "tiff")
@@ -1367,14 +1415,21 @@ def test_tiff_format(self, tmp_path, monkeypatch):
         pdf_file = tmp_path / "test.pdf"
         pdf_file.write_bytes(b"%PDF-1.4")
 
-        mock_img = MagicMock()
+        from pathlib import Path
 
-        with patch.object(local_converter, "convert_from_path", return_value=[mock_img]):
-            success, paths, error = local_converter.convert_pdf_to_images(pdf_file)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        temp_file = str(output_dir / "page-1.tiff")
+        Path(temp_file).touch()
+
+        with patch.object(local_converter, "convert_from_path", return_value=[temp_file]) as mock_convert:
+            success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir)
 
         assert success is True
-        call_args = mock_img.save.call_args
-        assert "TIFF" in call_args[0]
+        assert paths[0] == output_dir / "page_001.tiff"
+        mock_convert.assert_called_once()
+        kwargs = mock_convert.call_args[1]
+        assert kwargs["paths_only"] is True
+        assert kwargs["fmt"] == "tiff"
 
 
 if __name__ == "__main__":
diff --git a/tests/test_mistral_converter.py b/tests/test_mistral_converter.py
index 55c21cd..5352050 100644
--- a/tests/test_mistral_converter.py
+++ b/tests/test_mistral_converter.py
@@ -4288,3 +4288,50 @@ def test_mix_of_weak_and_strong(self, monkeypatch):
         assert 0 in weak  # "short" is weak
         assert 2 in weak  # empty is weak
         assert 3 not in weak  # long diverse text is not weak
+
+
+# ============================================================================
+# document classification routing tests
+# ============================================================================
+
+
+class TestDocumentClassificationRouting:
+    """Test classify_document_type and dynamic classification schema routing."""
+
+    def test_classify_by_filename(self, tmp_path):
+        f1 = tmp_path / "invoice_123.pdf"
+        assert mistral_converter.classify_document_type(f1) == "invoice"
+
+        f2 = tmp_path / "lease_agreement.docx"
+        assert mistral_converter.classify_document_type(f2) == "contract"
+
+        f3 = tmp_path / "balance_sheet_2025.xlsx"
+        assert mistral_converter.classify_document_type(f3) == "financial_statement"
+
+        f4 = tmp_path / "w9_form.pdf"
+        assert mistral_converter.classify_document_type(f4) == "form"
+
+        f5 = tmp_path / "notes.txt"
+        assert mistral_converter.classify_document_type(f5) == "generic"
+
+    def test_classify_by_text_content(self, tmp_path):
+        # Text file classification
+        f = tmp_path / "doc.txt"
+        f.write_text("This agreement is entered into by the parties undersigned...")
+        assert mistral_converter.classify_document_type(f) == "contract"
+
+        f2 = tmp_path / "doc2.txt"
+        f2.write_text("Total Amount Due: $1,200.50 Invoice Number: INV-091")
+        assert mistral_converter.classify_document_type(f2) == "invoice"
+
+    def test_shared_optional_params_routing(self, monkeypatch):
+        # When MISTRAL_DOCUMENT_SCHEMA_TYPE is "auto", classification is triggered
+        monkeypatch.setattr(config, "MISTRAL_DOCUMENT_SCHEMA_TYPE", "auto")
+        monkeypatch.setattr(config, "MISTRAL_ENABLE_DOCUMENT_ANNOTATION", True)
+
+        # Mock classify_document_type to return "invoice"
+        with patch.object(mistral_converter, "classify_document_type", return_value="invoice") as mock_classify:
+            with patch.object(mistral_converter, "get_document_annotation_format") as mock_fmt:
+                mistral_converter._ocr_shared_optional_params(file_path=Path("my_file.pdf"))
+                mock_classify.assert_called_once_with(Path("my_file.pdf"))
+                mock_fmt.assert_called_once_with(doc_type="invoice")

From f6e00c3df5744283727e5abad6f9f62e9214a204 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 19 May 2026 19:37:57 +0000
Subject: [PATCH 2/4] Fix document classification cache invalidation bugs

---
 local_converter.py              | 23 ++++++++++++++---
 mistral_converter.py            | 13 +++++++---
 tests/test_local_converter.py   | 45 ++++++++++++++++++++++++++++++---
 tests/test_mistral_converter.py |  8 ++++++
 4 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/local_converter.py b/local_converter.py
index c495190..876f266 100644
--- a/local_converter.py
+++ b/local_converter.py
@@ -70,6 +70,7 @@
 _MARKITDOWN_UNSET = object()  # sentinel: init never attempted
 _markitdown_instance = _MARKITDOWN_UNSET
 _markitdown_instances = threading.local()
+_markitdown_generation = 0
 _markitdown_lock = threading.Lock()
 
 
@@ -117,32 +118,48 @@ def get_markitdown_instance() -> Optional[MarkItDown]:
     """
     global _markitdown_instance
 
-    if not hasattr(_markitdown_instances, "instance"):
+    if (
+        not hasattr(_markitdown_instances, "instance")
+        or getattr(_markitdown_instances, "generation", None) != _markitdown_generation
+    ):
         with _markitdown_lock:
+            current_generation = _markitdown_generation
+            if (
+                hasattr(_markitdown_instances, "instance")
+                and getattr(_markitdown_instances, "generation", None) == current_generation
+            ):
+                return _markitdown_instances.instance
+
             if MarkItDown is None:
                 logger.error("MarkItDown not installed. Install with: pip install markitdown")
                 _markitdown_instances.instance = None
+                _markitdown_instances.generation = current_generation
                 _markitdown_instance = None
                 return None
 
             try:
                 instance = MarkItDown(**_build_markitdown_kwargs())
                 _markitdown_instances.instance = instance
+                _markitdown_instances.generation = current_generation
                 _markitdown_instance = instance
             except Exception as e:
                 logger.error("Error initializing MarkItDown: %s", e)
                 _markitdown_instances.instance = None
+                _markitdown_instances.generation = current_generation
                 _markitdown_instance = None
 
     return _markitdown_instances.instance
 
 
-def reset_markitdown_instance():
+def reset_markitdown_instance() -> None:
     """Reset the cached MarkItDown instance so the next call retries initialization."""
-    global _markitdown_instance
+    global _markitdown_generation, _markitdown_instance
     with _markitdown_lock:
         if hasattr(_markitdown_instances, "instance"):
             delattr(_markitdown_instances, "instance")
+        if hasattr(_markitdown_instances, "generation"):
+            delattr(_markitdown_instances, "generation")
+        _markitdown_generation += 1
         _markitdown_instance = _MARKITDOWN_UNSET
 
 
diff --git a/mistral_converter.py b/mistral_converter.py
index 6466520..c0c2fcf 100644
--- a/mistral_converter.py
+++ b/mistral_converter.py
@@ -953,6 +953,11 @@ def _cleanup_temp_files(temp_files: List[Path]) -> None:
             logger.warning("Could not delete temporary file %s: %s", temp_file.name, e)
 
 
+def _filename_has_keyword(name: str, keywords: List[str]) -> bool:
+    """Return True when a keyword appears as a filename token."""
+    return any(re.search(rf"(?<![a-z0-9]){re.escape(keyword)}(?![a-z0-9])", name) for keyword in keywords)
+
+
 def classify_document_type(file_path: Path) -> str:
     """Classify document type into generic, invoice, financial_statement, contract, or form.
 
@@ -964,16 +969,16 @@ def classify_document_type(file_path: Path) -> str:
     name = file_path.name.lower()
 
     # 1. Filename heuristic
-    if any(w in name for w in ["invoice", "receipt", "bill"]):
+    if _filename_has_keyword(name, ["invoice", "receipt", "bill"]):
         logger.debug("Classified %s as 'invoice' via filename", file_path.name)
         return "invoice"
-    if any(w in name for w in ["contract", "agreement", "nda", "lease"]):
+    if _filename_has_keyword(name, ["contract", "agreement", "nda", "lease"]):
         logger.debug("Classified %s as 'contract' via filename", file_path.name)
         return "contract"
-    if any(w in name for w in ["statement", "financial", "balance_sheet", "income"]):
+    if _filename_has_keyword(name, ["statement", "financial", "balance_sheet", "income"]):
         logger.debug("Classified %s as 'financial_statement' via filename", file_path.name)
         return "financial_statement"
-    if any(w in name for w in ["form", "w9", "w2", "tax"]):
+    if _filename_has_keyword(name, ["form", "w9", "w2", "tax"]):
         logger.debug("Classified %s as 'form' via filename", file_path.name)
         return "form"
 
diff --git a/tests/test_local_converter.py b/tests/test_local_converter.py
index efb331a..046b075 100644
--- a/tests/test_local_converter.py
+++ b/tests/test_local_converter.py
@@ -19,7 +19,9 @@
 """
 
 import io
+import queue
 import sys
+import threading
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -279,7 +281,6 @@ def test_cached_on_second_call(self):
 
     def test_thread_local_isolation(self):
         local_converter.reset_markitdown_instance()
-        import threading
 
         instances = []
 
@@ -297,6 +298,46 @@ def worker():
         if instances[0] is not None and instances[1] is not None:
             assert instances[0] is not instances[1]
 
+    def test_reset_invalidates_other_thread_cache(self):
+        local_converter.reset_markitdown_instance()
+
+        class FakeMarkItDown:
+            created = 0
+
+            def __init__(self, **kwargs):
+                type(self).created += 1
+                self.created = type(self).created
+
+        commands = queue.Queue()
+        responses = queue.Queue()
+
+        def worker():
+            while True:
+                command = commands.get(timeout=5)
+                if command == "stop":
+                    return
+                responses.put(local_converter.get_markitdown_instance())
+
+        with patch.object(local_converter, "MarkItDown", FakeMarkItDown):
+            worker_thread = threading.Thread(target=worker)
+            worker_thread.start()
+            try:
+                commands.put("get")
+                first = responses.get(timeout=5)
+
+                local_converter.reset_markitdown_instance()
+
+                commands.put("get")
+                second = responses.get(timeout=5)
+            finally:
+                commands.put("stop")
+                worker_thread.join(timeout=5)
+                local_converter.reset_markitdown_instance()
+
+        assert first is not second
+        assert first.created == 1
+        assert second.created == 2
+
 
 # ============================================================================
 # convert_with_markitdown Tests (mocked)
@@ -889,8 +930,6 @@ def test_init_exception(self, monkeypatch):
 
     def test_concurrent_lock_returns_cached(self):
         """Line 95: second call inside lock finds instance already set by first call."""
-        import threading
-
         local_converter.reset_markitdown_instance()
 
         results = [None, None]
diff --git a/tests/test_mistral_converter.py b/tests/test_mistral_converter.py
index 5352050..c5242a7 100644
--- a/tests/test_mistral_converter.py
+++ b/tests/test_mistral_converter.py
@@ -4314,6 +4314,14 @@ def test_classify_by_filename(self, tmp_path):
         f5 = tmp_path / "notes.txt"
         assert mistral_converter.classify_document_type(f5) == "generic"
 
+    def test_classify_by_filename_uses_token_boundaries(self, tmp_path, monkeypatch):
+        monkeypatch.setattr(config, "MISTRAL_API_KEY", "")
+
+        assert mistral_converter.classify_document_type(tmp_path / "information.pdf") == "generic"
+        assert mistral_converter.classify_document_type(tmp_path / "monday_notes.pdf") == "generic"
+        assert mistral_converter.classify_document_type(tmp_path / "taxonomy.pdf") == "generic"
+        assert mistral_converter.classify_document_type(tmp_path / "billboard.pdf") == "generic"
+
     def test_classify_by_text_content(self, tmp_path):
         # Text file classification
         f = tmp_path / "doc.txt"

From 8fea2ddfbff4ee50330f508dbec59cced09be2e7 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 19 May 2026 20:02:52 +0000
Subject: [PATCH 3/4] Remove thread-local MarkItDown double-check

---
 local_converter.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/local_converter.py b/local_converter.py
index 876f266..96bca15 100644
--- a/local_converter.py
+++ b/local_converter.py
@@ -124,11 +124,6 @@ def get_markitdown_instance() -> Optional[MarkItDown]:
     ):
         with _markitdown_lock:
             current_generation = _markitdown_generation
-            if (
-                hasattr(_markitdown_instances, "instance")
-                and getattr(_markitdown_instances, "generation", None) == current_generation
-            ):
-                return _markitdown_instances.instance
 
             if MarkItDown is None:
                 logger.error("MarkItDown not installed. Install with: pip install markitdown")

From 4293a6ea3272faa3f86b5a4f6c14001811e0b2a4 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 19 May 2026 20:18:24 +0000
Subject: [PATCH 4/4] Avoid unused document type classification

---
 mistral_converter.py            | 24 +++++++++++++-----------
 tests/test_mistral_converter.py | 12 ++++++++++++
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/mistral_converter.py b/mistral_converter.py
index c0c2fcf..db8081e 100644
--- a/mistral_converter.py
+++ b/mistral_converter.py
@@ -1056,19 +1056,21 @@ def _ocr_shared_optional_params(file_path: Optional[Path] = None, doc_type: str
     fields: Dict[str, Any] = {}
     bbox_format = get_bbox_annotation_format()
 
-    # Resolve dynamic classification if doc_type is "auto"
-    resolved_doc_type = doc_type
-    if resolved_doc_type == "auto":
-        nested = config.MISTRAL_DOCUMENT_SCHEMA_TYPE
-        if nested == "auto":
-            if file_path is not None:
-                resolved_doc_type = classify_document_type(file_path)
+    doc_format = None
+    if config.MISTRAL_ENABLE_STRUCTURED_OUTPUT and config.MISTRAL_ENABLE_DOCUMENT_ANNOTATION:
+        # Resolve dynamic classification only when document annotation can use it.
+        resolved_doc_type = doc_type
+        if resolved_doc_type == "auto":
+            nested = config.MISTRAL_DOCUMENT_SCHEMA_TYPE
+            if nested == "auto":
+                if file_path is not None:
+                    resolved_doc_type = classify_document_type(file_path)
+                else:
+                    resolved_doc_type = "generic"
             else:
-                resolved_doc_type = "generic"
-        else:
-            resolved_doc_type = nested
+                resolved_doc_type = nested
 
-    doc_format = get_document_annotation_format(doc_type=resolved_doc_type)
+        doc_format = get_document_annotation_format(doc_type=resolved_doc_type)
     if bbox_format is not None:
         fields["bbox_annotation_format"] = bbox_format
     if doc_format is not None:
diff --git a/tests/test_mistral_converter.py b/tests/test_mistral_converter.py
index c5242a7..bb7e299 100644
--- a/tests/test_mistral_converter.py
+++ b/tests/test_mistral_converter.py
@@ -4335,6 +4335,7 @@ def test_classify_by_text_content(self, tmp_path):
     def test_shared_optional_params_routing(self, monkeypatch):
         # When MISTRAL_DOCUMENT_SCHEMA_TYPE is "auto", classification is triggered
         monkeypatch.setattr(config, "MISTRAL_DOCUMENT_SCHEMA_TYPE", "auto")
+        monkeypatch.setattr(config, "MISTRAL_ENABLE_STRUCTURED_OUTPUT", True)
         monkeypatch.setattr(config, "MISTRAL_ENABLE_DOCUMENT_ANNOTATION", True)
 
         # Mock classify_document_type to return "invoice"
@@ -4343,3 +4344,14 @@ def test_shared_optional_params_routing(self, monkeypatch):
                 mistral_converter._ocr_shared_optional_params(file_path=Path("my_file.pdf"))
                 mock_classify.assert_called_once_with(Path("my_file.pdf"))
                 mock_fmt.assert_called_once_with(doc_type="invoice")
+
+    def test_shared_optional_params_skips_classification_when_document_annotation_disabled(self, monkeypatch):
+        monkeypatch.setattr(config, "MISTRAL_DOCUMENT_SCHEMA_TYPE", "auto")
+        monkeypatch.setattr(config, "MISTRAL_ENABLE_STRUCTURED_OUTPUT", True)
+        monkeypatch.setattr(config, "MISTRAL_ENABLE_DOCUMENT_ANNOTATION", False)
+
+        with patch.object(mistral_converter, "classify_document_type", return_value="invoice") as mock_classify:
+            with patch.object(mistral_converter, "get_document_annotation_format") as mock_fmt:
+                mistral_converter._ocr_shared_optional_params(file_path=Path("my_file.pdf"))
+                mock_classify.assert_not_called()
+                mock_fmt.assert_not_called()