From 63bd356145eff9407f3f51f0571b5ef5c5d2e9d9 Mon Sep 17 00:00:00 2001 From: Balaxxe <136368465+Balaxxe@users.noreply.github.com> Date: Tue, 19 May 2026 12:23:52 -0700 Subject: [PATCH 1/4] Use thread-local MarkItDown; add doc classifier Switch MarkItDown caching to thread-local instances (threading.local) so concurrent workers can convert without serializing on a single global instance; initialization errors are remembered per-thread and reset_markitdown_instance clears the thread-local cache. Remove the global convert lock and allow concurrent convert/convert_stream calls on thread-local instances. Change PDF-to-image conversion to use convert_from_path with paths_only (write files directly to disk) and rename temporary outputs to expected page_###. names to avoid large in-memory image objects. Add classify_document_type() in mistral_converter with filename/text heuristics and optional LLM fallback, and route dynamic document schema selection through _ocr_shared_optional_params/build_ocr_process_kwargs/process_with_ocr/_prepare_batch_entries by accepting file_path and doc_type. Update tests to reflect thread-local behavior, on-disk PDF conversion (expect paths and paths_only flag), and document classification routing. --- local_converter.py | 104 +++++++++++++--------------- mistral_converter.py | 116 +++++++++++++++++++++++++++++++- tests/test_local_converter.py | 99 +++++++++++++++++++++------ tests/test_mistral_converter.py | 47 +++++++++++++ 4 files changed, 284 insertions(+), 82 deletions(-) diff --git a/local_converter.py b/local_converter.py index 8f6a4ab..c495190 100644 --- a/local_converter.py +++ b/local_converter.py @@ -69,8 +69,8 @@ _MARKITDOWN_UNSET = object() # sentinel: init never attempted _markitdown_instance = _MARKITDOWN_UNSET +_markitdown_instances = threading.local() _markitdown_lock = threading.Lock() -_markitdown_convert_lock = threading.Lock() def _build_markitdown_kwargs() -> Dict[str, Any]: @@ -110,42 +110,39 @@ def _build_markitdown_kwargs() -> Dict[str, Any]: def get_markitdown_instance() -> Optional[MarkItDown]: """ - Create and configure a MarkItDown instance (thread-safe). + Create and configure a MarkItDown instance (thread-safe, thread-local). - Uses a module-level cache protected by a lock so concurrent threads - in batch processing don't race on initialization. A failed init is - remembered (instance set to ``None``) so subsequent calls return - immediately without retrying or logging duplicate errors. + Uses a thread-local cache so concurrent threads in batch processing + don't serialize on conversion tasks. """ global _markitdown_instance - cached = _markitdown_instance - if cached is not _MARKITDOWN_UNSET: - return cached # either a live instance or None (failed init) + if not hasattr(_markitdown_instances, "instance"): + with _markitdown_lock: + if MarkItDown is None: + logger.error("MarkItDown not installed. Install with: pip install markitdown") + _markitdown_instances.instance = None + _markitdown_instance = None + return None - with _markitdown_lock: - if _markitdown_instance is not _MARKITDOWN_UNSET: - return _markitdown_instance # pragma: no cover - - if MarkItDown is None: - logger.error("MarkItDown not installed. Install with: pip install markitdown") - _markitdown_instance = None - return None - - try: - _markitdown_instance = MarkItDown(**_build_markitdown_kwargs()) - return _markitdown_instance + try: + instance = MarkItDown(**_build_markitdown_kwargs()) + _markitdown_instances.instance = instance + _markitdown_instance = instance + except Exception as e: + logger.error("Error initializing MarkItDown: %s", e) + _markitdown_instances.instance = None + _markitdown_instance = None - except Exception as e: - logger.error("Error initializing MarkItDown: %s", e) - _markitdown_instance = None # remember the failure - return None + return _markitdown_instances.instance def reset_markitdown_instance(): """Reset the cached MarkItDown instance so the next call retries initialization.""" global _markitdown_instance with _markitdown_lock: + if hasattr(_markitdown_instances, "instance"): + delattr(_markitdown_instances, "instance") _markitdown_instance = _MARKITDOWN_UNSET @@ -177,10 +174,8 @@ def convert_with_markitdown( try: logger.info("Converting with MarkItDown: %s", file_path.name) - # Serialize convert() — MarkItDown is not documented as thread-safe for - # concurrent converts while we share one cached instance across workers. - with _markitdown_convert_lock: - result = md.convert(str(file_path)) + # Convert concurrently using thread-local MarkItDown instance + result = md.convert(str(file_path)) if result and hasattr(result, "markdown"): markdown_content = result.markdown @@ -273,17 +268,16 @@ def convert_stream_with_markitdown( try: logger.info("Converting stream with MarkItDown: %s", filename) - with _markitdown_convert_lock: - stream_info = None - if StreamInfo is not None: - stream_info = StreamInfo( - extension=Path(filename).suffix, - filename=filename, - ) - if stream_info is not None: - result = md.convert_stream(stream, stream_info=stream_info) - else: - result = md.convert_stream(stream, file_extension=Path(filename).suffix) + stream_info = None + if StreamInfo is not None: + stream_info = StreamInfo( + extension=Path(filename).suffix, + filename=filename, + ) + if stream_info is not None: + result = md.convert_stream(stream, stream_info=stream_info) + else: + result = md.convert_stream(stream, file_extension=Path(filename).suffix) if result and hasattr(result, "markdown"): return True, result.markdown, None @@ -770,7 +764,7 @@ def convert_pdf_to_images( # Configure poppler path for Windows poppler_path = (config.POPPLER_PATH or None) if sys.platform == "win32" else None - # Build conversion parameters + # Build conversion parameters (using paths_only to avoid RAM spikes on large PDFs) convert_params = { "pdf_path": str(pdf_path), "dpi": dpi, @@ -779,28 +773,24 @@ def convert_pdf_to_images( "poppler_path": poppler_path, "thread_count": max(1, thread_count), "use_pdftocairo": config.PDF_IMAGE_USE_PDFTOCAIRO, + "output_file": "page", + "paths_only": True, } - # Convert PDF to images - images = convert_from_path(**convert_params) + # Convert PDF to images directly on disk + temp_paths = convert_from_path(**convert_params) - # Save images + # Rename output files to preserve expected page_001.ext structure image_paths = [] file_extension = "jpg" if config.PDF_IMAGE_FORMAT == "jpeg" else config.PDF_IMAGE_FORMAT - for i, image in enumerate(images, 1): - image_path = output_dir / f"page_{i:03d}.{file_extension}" - - # Save with format-specific options - if config.PDF_IMAGE_FORMAT == "jpeg": - image.save(str(image_path), "JPEG", quality=85, optimize=True, progressive=True) - elif config.PDF_IMAGE_FORMAT == "png": - image.save(str(image_path), "PNG", optimize=True) - else: - image.save(str(image_path), config.PDF_IMAGE_FORMAT.upper()) - - image_paths.append(image_path) - logger.debug("Saved page %d to %s", i, image_path.name) + for i, temp_path_str in enumerate(temp_paths, 1): + temp_path = Path(temp_path_str) + target_path = output_dir / f"page_{i:03d}.{file_extension}" + if temp_path.exists() and temp_path != target_path: + temp_path.replace(target_path) + image_paths.append(target_path) + logger.debug("Saved page %d to %s", i, target_path.name) logger.info( "Converted %d pages to %s images", diff --git a/mistral_converter.py b/mistral_converter.py index 8f22b32..6466520 100644 --- a/mistral_converter.py +++ b/mistral_converter.py @@ -953,11 +953,117 @@ def _cleanup_temp_files(temp_files: List[Path]) -> None: logger.warning("Could not delete temporary file %s: %s", temp_file.name, e) -def _ocr_shared_optional_params() -> Dict[str, Any]: +def classify_document_type(file_path: Path) -> str: + """Classify document type into generic, invoice, financial_statement, contract, or form. + + Uses a hybrid approach: + 1. Regex checks on filename/path. + 2. Parsing first-page text (if PDF or text file) for key identifiers. + 3. Cheap LLM classification fallback (using ministral-8b-latest or mistral-small-latest). + """ + name = file_path.name.lower() + + # 1. Filename heuristic + if any(w in name for w in ["invoice", "receipt", "bill"]): + logger.debug("Classified %s as 'invoice' via filename", file_path.name) + return "invoice" + if any(w in name for w in ["contract", "agreement", "nda", "lease"]): + logger.debug("Classified %s as 'contract' via filename", file_path.name) + return "contract" + if any(w in name for w in ["statement", "financial", "balance_sheet", "income"]): + logger.debug("Classified %s as 'financial_statement' via filename", file_path.name) + return "financial_statement" + if any(w in name for w in ["form", "w9", "w2", "tax"]): + logger.debug("Classified %s as 'form' via filename", file_path.name) + return "form" + + # 2. First-page text content check (for text PDFs/txt files) + ext = file_path.suffix.lower().lstrip(".") + first_text = "" + + if ext == "pdf": + try: + import pdfplumber + + with pdfplumber.open(file_path) as pdf: + if pdf.pages: + first_text = (pdf.pages[0].extract_text() or "")[:1000] + except Exception as e: + logger.debug("Could not extract PDF text for classification: %s", e) + elif ext == "txt": + try: + first_text = file_path.read_text(errors="ignore")[:1000] + except Exception as e: + logger.debug("Could not read text file for classification: %s", e) + + if first_text: + first_text_lower = first_text.lower() + if any(w in first_text_lower for w in ["invoice", "receipt", "purchase order", "amount due"]): + logger.debug("Classified %s as 'invoice' via page 1 text", file_path.name) + return "invoice" + if any(w in first_text_lower for w in ["agreement", "contract", "parties", "hereby", "undersigned"]): + logger.debug("Classified %s as 'contract' via page 1 text", file_path.name) + return "contract" + if any( + w in first_text_lower + for w in ["statement of", "balance sheet", "cash flow", "income statement", "assets", "liabilities"] + ): + logger.debug("Classified %s as 'financial_statement' via page 1 text", file_path.name) + return "financial_statement" + if any(w in first_text_lower for w in ["form ", "w-9", "w-2", "tax return", "filer"]): + logger.debug("Classified %s as 'form' via page 1 text", file_path.name) + return "form" + + # 3. LLM fallback check (if API key is present) + if config.MISTRAL_API_KEY: + try: + client = get_mistral_client() + if client: + prompt = ( + f"Classify the document '{file_path.name}' into one of these types: " + "invoice, financial_statement, contract, form, generic.\n" + "Reply with only the lowercase name of the type.\n" + ) + if first_text: + prompt += f"First page excerpt:\n{first_text[:500]}\n" + + model = config.MISTRAL_DOCUMENT_QNA_MODEL or "ministral-8b-latest" + response = client.chat.complete( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=10, + ) + if response and response.choices: + result = response.choices[0].message.content.strip().lower() + for t in ["invoice", "financial_statement", "contract", "form"]: + if t in result: + logger.debug("Classified %s as '%s' via LLM", file_path.name, t) + return t + except Exception as e: + logger.debug("LLM classification check failed: %s", e) + + logger.debug("Classified %s as 'generic'", file_path.name) + return "generic" + + +def _ocr_shared_optional_params(file_path: Optional[Path] = None, doc_type: str = "auto") -> Dict[str, Any]: """OCR fields shared by sync ``ocr.process`` and batch JSONL ``body``.""" fields: Dict[str, Any] = {} bbox_format = get_bbox_annotation_format() - doc_format = get_document_annotation_format() + + # Resolve dynamic classification if doc_type is "auto" + resolved_doc_type = doc_type + if resolved_doc_type == "auto": + nested = config.MISTRAL_DOCUMENT_SCHEMA_TYPE + if nested == "auto": + if file_path is not None: + resolved_doc_type = classify_document_type(file_path) + else: + resolved_doc_type = "generic" + else: + resolved_doc_type = nested + + doc_format = get_document_annotation_format(doc_type=resolved_doc_type) if bbox_format is not None: fields["bbox_annotation_format"] = bbox_format if doc_format is not None: @@ -982,6 +1088,8 @@ def build_ocr_process_kwargs( include_retries: bool, pages: Optional[List[int]] = None, request_id: Optional[str] = None, + file_path: Optional[Path] = None, + doc_type: str = "auto", ) -> Dict[str, Any]: """Build kwargs for ``client.ocr.process`` or a batch JSONL line ``body``.""" ocr_params: Dict[str, Any] = { @@ -995,7 +1103,7 @@ def build_ocr_process_kwargs( ocr_params["id"] = request_id if pages is not None: ocr_params["pages"] = pages - ocr_params.update(_ocr_shared_optional_params()) + ocr_params.update(_ocr_shared_optional_params(file_path=file_path, doc_type=doc_type)) return ocr_params @@ -1130,6 +1238,7 @@ def _report_progress(message: str, progress: float = 0.0): include_retries=True, pages=pages, request_id=ocr_id, + file_path=file_path, ) response = client.ocr.process(**ocr_params) @@ -2521,6 +2630,7 @@ def _prepare_batch_entries( include_retries=False, pages=None, request_id=custom_id, + file_path=file_path, ) body["include_image_base64"] = include_image_base64 diff --git a/tests/test_local_converter.py b/tests/test_local_converter.py index 9bde21c..efb331a 100644 --- a/tests/test_local_converter.py +++ b/tests/test_local_converter.py @@ -277,6 +277,26 @@ def test_cached_on_second_call(self): second = local_converter.get_markitdown_instance() assert first is second + def test_thread_local_isolation(self): + local_converter.reset_markitdown_instance() + import threading + + instances = [] + + def worker(): + inst = local_converter.get_markitdown_instance() + instances.append(inst) + + t1 = threading.Thread(target=worker) + t2 = threading.Thread(target=worker) + t1.start() + t2.start() + t1.join() + t2.join() + + if instances[0] is not None and instances[1] is not None: + assert instances[0] is not instances[1] + # ============================================================================ # convert_with_markitdown Tests (mocked) @@ -582,6 +602,7 @@ def test_not_installed(self, tmp_path): assert "not installed" in error.lower() or "not available" in error.lower() def test_successful_conversion(self, tmp_path, monkeypatch): + output_dir = tmp_path / "test_pages" monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path) monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200) monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "png") @@ -592,15 +613,27 @@ def test_successful_conversion(self, tmp_path, monkeypatch): pdf_file = tmp_path / "test.pdf" pdf_file.write_bytes(b"%PDF-1.4") - # Create mock PIL Image objects - mock_img1 = MagicMock() - mock_img2 = MagicMock() + from pathlib import Path - with patch.object(local_converter, "convert_from_path", return_value=[mock_img1, mock_img2]): - success, paths, error = local_converter.convert_pdf_to_images(pdf_file) + output_dir.mkdir(parents=True, exist_ok=True) + temp_file1 = str(output_dir / "page-1.png") + temp_file2 = str(output_dir / "page-2.png") + Path(temp_file1).touch() + Path(temp_file2).touch() + + with patch.object(local_converter, "convert_from_path", return_value=[temp_file1, temp_file2]) as mock_convert: + success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir) assert success is True assert len(paths) == 2 + assert paths[0] == output_dir / "page_001.png" + assert paths[1] == output_dir / "page_002.png" + assert (output_dir / "page_001.png").exists() + assert (output_dir / "page_002.png").exists() + + mock_convert.assert_called_once() + kwargs = mock_convert.call_args[1] + assert kwargs["paths_only"] is True def test_handles_conversion_error(self, tmp_path, monkeypatch): monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path) @@ -1113,6 +1146,7 @@ class TestConvertPdfToImagesPng: """Line 688: PNG format save branch.""" def test_png_format(self, tmp_path, monkeypatch): + output_dir = tmp_path / "png_pages" monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path) monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200) monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "png") @@ -1123,19 +1157,25 @@ def test_png_format(self, tmp_path, monkeypatch): pdf_file = tmp_path / "test.pdf" pdf_file.write_bytes(b"%PDF-1.4") - mock_img = MagicMock() + from pathlib import Path - with patch.object(local_converter, "convert_from_path", return_value=[mock_img]): - success, paths, error = local_converter.convert_pdf_to_images(pdf_file) + output_dir.mkdir(parents=True, exist_ok=True) + temp_file = str(output_dir / "page-1.png") + Path(temp_file).touch() + + with patch.object(local_converter, "convert_from_path", return_value=[temp_file]) as mock_convert: + success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir) assert success is True - # Verify PNG save was called - mock_img.save.assert_called_once() - call_args = mock_img.save.call_args - assert "PNG" in call_args[0] + assert paths[0] == output_dir / "page_001.png" + mock_convert.assert_called_once() + kwargs = mock_convert.call_args[1] + assert kwargs["paths_only"] is True + assert kwargs["fmt"] == "png" def test_jpeg_format(self, tmp_path, monkeypatch): """Verify JPEG format branch for completeness.""" + output_dir = tmp_path / "jpeg_pages" monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path) monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200) monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "jpeg") @@ -1146,14 +1186,21 @@ def test_jpeg_format(self, tmp_path, monkeypatch): pdf_file = tmp_path / "test.pdf" pdf_file.write_bytes(b"%PDF-1.4") - mock_img = MagicMock() + from pathlib import Path - with patch.object(local_converter, "convert_from_path", return_value=[mock_img]): - success, paths, error = local_converter.convert_pdf_to_images(pdf_file) + output_dir.mkdir(parents=True, exist_ok=True) + temp_file = str(output_dir / "page-1.jpg") + Path(temp_file).touch() + + with patch.object(local_converter, "convert_from_path", return_value=[temp_file]) as mock_convert: + success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir) assert success is True - call_args = mock_img.save.call_args - assert "JPEG" in call_args[0] + assert paths[0] == output_dir / "page_001.jpg" + mock_convert.assert_called_once() + kwargs = mock_convert.call_args[1] + assert kwargs["paths_only"] is True + assert kwargs["fmt"] == "jpeg" # ============================================================================ @@ -1357,6 +1404,7 @@ class TestConvertPdfToImagesOtherFormat: def test_tiff_format(self, tmp_path, monkeypatch): """Other format branch uses PDF_IMAGE_FORMAT.upper().""" + output_dir = tmp_path / "tiff_pages" monkeypatch.setattr(config, "OUTPUT_IMAGES_DIR", tmp_path) monkeypatch.setattr(config, "PDF_IMAGE_DPI", 200) monkeypatch.setattr(config, "PDF_IMAGE_FORMAT", "tiff") @@ -1367,14 +1415,21 @@ def test_tiff_format(self, tmp_path, monkeypatch): pdf_file = tmp_path / "test.pdf" pdf_file.write_bytes(b"%PDF-1.4") - mock_img = MagicMock() + from pathlib import Path - with patch.object(local_converter, "convert_from_path", return_value=[mock_img]): - success, paths, error = local_converter.convert_pdf_to_images(pdf_file) + output_dir.mkdir(parents=True, exist_ok=True) + temp_file = str(output_dir / "page-1.tiff") + Path(temp_file).touch() + + with patch.object(local_converter, "convert_from_path", return_value=[temp_file]) as mock_convert: + success, paths, error = local_converter.convert_pdf_to_images(pdf_file, output_dir=output_dir) assert success is True - call_args = mock_img.save.call_args - assert "TIFF" in call_args[0] + assert paths[0] == output_dir / "page_001.tiff" + mock_convert.assert_called_once() + kwargs = mock_convert.call_args[1] + assert kwargs["paths_only"] is True + assert kwargs["fmt"] == "tiff" if __name__ == "__main__": diff --git a/tests/test_mistral_converter.py b/tests/test_mistral_converter.py index 55c21cd..5352050 100644 --- a/tests/test_mistral_converter.py +++ b/tests/test_mistral_converter.py @@ -4288,3 +4288,50 @@ def test_mix_of_weak_and_strong(self, monkeypatch): assert 0 in weak # "short" is weak assert 2 in weak # empty is weak assert 3 not in weak # long diverse text is not weak + + +# ============================================================================ +# document classification routing tests +# ============================================================================ + + +class TestDocumentClassificationRouting: + """Test classify_document_type and dynamic classification schema routing.""" + + def test_classify_by_filename(self, tmp_path): + f1 = tmp_path / "invoice_123.pdf" + assert mistral_converter.classify_document_type(f1) == "invoice" + + f2 = tmp_path / "lease_agreement.docx" + assert mistral_converter.classify_document_type(f2) == "contract" + + f3 = tmp_path / "balance_sheet_2025.xlsx" + assert mistral_converter.classify_document_type(f3) == "financial_statement" + + f4 = tmp_path / "w9_form.pdf" + assert mistral_converter.classify_document_type(f4) == "form" + + f5 = tmp_path / "notes.txt" + assert mistral_converter.classify_document_type(f5) == "generic" + + def test_classify_by_text_content(self, tmp_path): + # Text file classification + f = tmp_path / "doc.txt" + f.write_text("This agreement is entered into by the parties undersigned...") + assert mistral_converter.classify_document_type(f) == "contract" + + f2 = tmp_path / "doc2.txt" + f2.write_text("Total Amount Due: $1,200.50 Invoice Number: INV-091") + assert mistral_converter.classify_document_type(f2) == "invoice" + + def test_shared_optional_params_routing(self, monkeypatch): + # When MISTRAL_DOCUMENT_SCHEMA_TYPE is "auto", classification is triggered + monkeypatch.setattr(config, "MISTRAL_DOCUMENT_SCHEMA_TYPE", "auto") + monkeypatch.setattr(config, "MISTRAL_ENABLE_DOCUMENT_ANNOTATION", True) + + # Mock classify_document_type to return "invoice" + with patch.object(mistral_converter, "classify_document_type", return_value="invoice") as mock_classify: + with patch.object(mistral_converter, "get_document_annotation_format") as mock_fmt: + mistral_converter._ocr_shared_optional_params(file_path=Path("my_file.pdf")) + mock_classify.assert_called_once_with(Path("my_file.pdf")) + mock_fmt.assert_called_once_with(doc_type="invoice") From f6e00c3df5744283727e5abad6f9f62e9214a204 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 19 May 2026 19:37:57 +0000 Subject: [PATCH 2/4] Fix document classification cache invalidation bugs --- local_converter.py | 23 ++++++++++++++--- mistral_converter.py | 13 +++++++--- tests/test_local_converter.py | 45 ++++++++++++++++++++++++++++++--- tests/test_mistral_converter.py | 8 ++++++ 4 files changed, 79 insertions(+), 10 deletions(-) diff --git a/local_converter.py b/local_converter.py index c495190..876f266 100644 --- a/local_converter.py +++ b/local_converter.py @@ -70,6 +70,7 @@ _MARKITDOWN_UNSET = object() # sentinel: init never attempted _markitdown_instance = _MARKITDOWN_UNSET _markitdown_instances = threading.local() +_markitdown_generation = 0 _markitdown_lock = threading.Lock() @@ -117,32 +118,48 @@ def get_markitdown_instance() -> Optional[MarkItDown]: """ global _markitdown_instance - if not hasattr(_markitdown_instances, "instance"): + if ( + not hasattr(_markitdown_instances, "instance") + or getattr(_markitdown_instances, "generation", None) != _markitdown_generation + ): with _markitdown_lock: + current_generation = _markitdown_generation + if ( + hasattr(_markitdown_instances, "instance") + and getattr(_markitdown_instances, "generation", None) == current_generation + ): + return _markitdown_instances.instance + if MarkItDown is None: logger.error("MarkItDown not installed. Install with: pip install markitdown") _markitdown_instances.instance = None + _markitdown_instances.generation = current_generation _markitdown_instance = None return None try: instance = MarkItDown(**_build_markitdown_kwargs()) _markitdown_instances.instance = instance + _markitdown_instances.generation = current_generation _markitdown_instance = instance except Exception as e: logger.error("Error initializing MarkItDown: %s", e) _markitdown_instances.instance = None + _markitdown_instances.generation = current_generation _markitdown_instance = None return _markitdown_instances.instance -def reset_markitdown_instance(): +def reset_markitdown_instance() -> None: """Reset the cached MarkItDown instance so the next call retries initialization.""" - global _markitdown_instance + global _markitdown_generation, _markitdown_instance with _markitdown_lock: if hasattr(_markitdown_instances, "instance"): delattr(_markitdown_instances, "instance") + if hasattr(_markitdown_instances, "generation"): + delattr(_markitdown_instances, "generation") + _markitdown_generation += 1 _markitdown_instance = _MARKITDOWN_UNSET diff --git a/mistral_converter.py b/mistral_converter.py index 6466520..c0c2fcf 100644 --- a/mistral_converter.py +++ b/mistral_converter.py @@ -953,6 +953,11 @@ def _cleanup_temp_files(temp_files: List[Path]) -> None: logger.warning("Could not delete temporary file %s: %s", temp_file.name, e) +def _filename_has_keyword(name: str, keywords: List[str]) -> bool: + """Return True when a keyword appears as a filename token.""" + return any(re.search(rf"(? str: """Classify document type into generic, invoice, financial_statement, contract, or form. @@ -964,16 +969,16 @@ def classify_document_type(file_path: Path) -> str: name = file_path.name.lower() # 1. Filename heuristic - if any(w in name for w in ["invoice", "receipt", "bill"]): + if _filename_has_keyword(name, ["invoice", "receipt", "bill"]): logger.debug("Classified %s as 'invoice' via filename", file_path.name) return "invoice" - if any(w in name for w in ["contract", "agreement", "nda", "lease"]): + if _filename_has_keyword(name, ["contract", "agreement", "nda", "lease"]): logger.debug("Classified %s as 'contract' via filename", file_path.name) return "contract" - if any(w in name for w in ["statement", "financial", "balance_sheet", "income"]): + if _filename_has_keyword(name, ["statement", "financial", "balance_sheet", "income"]): logger.debug("Classified %s as 'financial_statement' via filename", file_path.name) return "financial_statement" - if any(w in name for w in ["form", "w9", "w2", "tax"]): + if _filename_has_keyword(name, ["form", "w9", "w2", "tax"]): logger.debug("Classified %s as 'form' via filename", file_path.name) return "form" diff --git a/tests/test_local_converter.py b/tests/test_local_converter.py index efb331a..046b075 100644 --- a/tests/test_local_converter.py +++ b/tests/test_local_converter.py @@ -19,7 +19,9 @@ """ import io +import queue import sys +import threading from unittest.mock import MagicMock, patch import pytest @@ -279,7 +281,6 @@ def test_cached_on_second_call(self): def test_thread_local_isolation(self): local_converter.reset_markitdown_instance() - import threading instances = [] @@ -297,6 +298,46 @@ def worker(): if instances[0] is not None and instances[1] is not None: assert instances[0] is not instances[1] + def test_reset_invalidates_other_thread_cache(self): + local_converter.reset_markitdown_instance() + + class FakeMarkItDown: + created = 0 + + def __init__(self, **kwargs): + type(self).created += 1 + self.created = type(self).created + + commands = queue.Queue() + responses = queue.Queue() + + def worker(): + while True: + command = commands.get(timeout=5) + if command == "stop": + return + responses.put(local_converter.get_markitdown_instance()) + + with patch.object(local_converter, "MarkItDown", FakeMarkItDown): + worker_thread = threading.Thread(target=worker) + worker_thread.start() + try: + commands.put("get") + first = responses.get(timeout=5) + + local_converter.reset_markitdown_instance() + + commands.put("get") + second = responses.get(timeout=5) + finally: + commands.put("stop") + worker_thread.join(timeout=5) + local_converter.reset_markitdown_instance() + + assert first is not second + assert first.created == 1 + assert second.created == 2 + # ============================================================================ # convert_with_markitdown Tests (mocked) @@ -889,8 +930,6 @@ def test_init_exception(self, monkeypatch): def test_concurrent_lock_returns_cached(self): """Line 95: second call inside lock finds instance already set by first call.""" - import threading - local_converter.reset_markitdown_instance() results = [None, None] diff --git a/tests/test_mistral_converter.py b/tests/test_mistral_converter.py index 5352050..c5242a7 100644 --- a/tests/test_mistral_converter.py +++ b/tests/test_mistral_converter.py @@ -4314,6 +4314,14 @@ def test_classify_by_filename(self, tmp_path): f5 = tmp_path / "notes.txt" assert mistral_converter.classify_document_type(f5) == "generic" + def test_classify_by_filename_uses_token_boundaries(self, tmp_path, monkeypatch): + monkeypatch.setattr(config, "MISTRAL_API_KEY", "") + + assert mistral_converter.classify_document_type(tmp_path / "information.pdf") == "generic" + assert mistral_converter.classify_document_type(tmp_path / "monday_notes.pdf") == "generic" + assert mistral_converter.classify_document_type(tmp_path / "taxonomy.pdf") == "generic" + assert mistral_converter.classify_document_type(tmp_path / "billboard.pdf") == "generic" + def test_classify_by_text_content(self, tmp_path): # Text file classification f = tmp_path / "doc.txt" From 8fea2ddfbff4ee50330f508dbec59cced09be2e7 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 19 May 2026 20:02:52 +0000 Subject: [PATCH 3/4] Remove thread-local MarkItDown double-check --- local_converter.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/local_converter.py b/local_converter.py index 876f266..96bca15 100644 --- a/local_converter.py +++ b/local_converter.py @@ -124,11 +124,6 @@ def get_markitdown_instance() -> Optional[MarkItDown]: ): with _markitdown_lock: current_generation = _markitdown_generation - if ( - hasattr(_markitdown_instances, "instance") - and getattr(_markitdown_instances, "generation", None) == current_generation - ): - return _markitdown_instances.instance if MarkItDown is None: logger.error("MarkItDown not installed. Install with: pip install markitdown") From 4293a6ea3272faa3f86b5a4f6c14001811e0b2a4 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 19 May 2026 20:18:24 +0000 Subject: [PATCH 4/4] Avoid unused document type classification --- mistral_converter.py | 24 +++++++++++++----------- tests/test_mistral_converter.py | 12 ++++++++++++ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/mistral_converter.py b/mistral_converter.py index c0c2fcf..db8081e 100644 --- a/mistral_converter.py +++ b/mistral_converter.py @@ -1056,19 +1056,21 @@ def _ocr_shared_optional_params(file_path: Optional[Path] = None, doc_type: str fields: Dict[str, Any] = {} bbox_format = get_bbox_annotation_format() - # Resolve dynamic classification if doc_type is "auto" - resolved_doc_type = doc_type - if resolved_doc_type == "auto": - nested = config.MISTRAL_DOCUMENT_SCHEMA_TYPE - if nested == "auto": - if file_path is not None: - resolved_doc_type = classify_document_type(file_path) + doc_format = None + if config.MISTRAL_ENABLE_STRUCTURED_OUTPUT and config.MISTRAL_ENABLE_DOCUMENT_ANNOTATION: + # Resolve dynamic classification only when document annotation can use it. + resolved_doc_type = doc_type + if resolved_doc_type == "auto": + nested = config.MISTRAL_DOCUMENT_SCHEMA_TYPE + if nested == "auto": + if file_path is not None: + resolved_doc_type = classify_document_type(file_path) + else: + resolved_doc_type = "generic" else: - resolved_doc_type = "generic" - else: - resolved_doc_type = nested + resolved_doc_type = nested - doc_format = get_document_annotation_format(doc_type=resolved_doc_type) + doc_format = get_document_annotation_format(doc_type=resolved_doc_type) if bbox_format is not None: fields["bbox_annotation_format"] = bbox_format if doc_format is not None: diff --git a/tests/test_mistral_converter.py b/tests/test_mistral_converter.py index c5242a7..bb7e299 100644 --- a/tests/test_mistral_converter.py +++ b/tests/test_mistral_converter.py @@ -4335,6 +4335,7 @@ def test_classify_by_text_content(self, tmp_path): def test_shared_optional_params_routing(self, monkeypatch): # When MISTRAL_DOCUMENT_SCHEMA_TYPE is "auto", classification is triggered monkeypatch.setattr(config, "MISTRAL_DOCUMENT_SCHEMA_TYPE", "auto") + monkeypatch.setattr(config, "MISTRAL_ENABLE_STRUCTURED_OUTPUT", True) monkeypatch.setattr(config, "MISTRAL_ENABLE_DOCUMENT_ANNOTATION", True) # Mock classify_document_type to return "invoice" @@ -4343,3 +4344,14 @@ def test_shared_optional_params_routing(self, monkeypatch): mistral_converter._ocr_shared_optional_params(file_path=Path("my_file.pdf")) mock_classify.assert_called_once_with(Path("my_file.pdf")) mock_fmt.assert_called_once_with(doc_type="invoice") + + def test_shared_optional_params_skips_classification_when_document_annotation_disabled(self, monkeypatch): + monkeypatch.setattr(config, "MISTRAL_DOCUMENT_SCHEMA_TYPE", "auto") + monkeypatch.setattr(config, "MISTRAL_ENABLE_STRUCTURED_OUTPUT", True) + monkeypatch.setattr(config, "MISTRAL_ENABLE_DOCUMENT_ANNOTATION", False) + + with patch.object(mistral_converter, "classify_document_type", return_value="invoice") as mock_classify: + with patch.object(mistral_converter, "get_document_annotation_format") as mock_fmt: + mistral_converter._ocr_shared_optional_params(file_path=Path("my_file.pdf")) + mock_classify.assert_not_called() + mock_fmt.assert_not_called()