From 6702ca46b04f5c4cd6e809e4a948f38918ee71b2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 28 Aug 2025 14:18:16 +0000 Subject: [PATCH 1/3] Initial plan From e439182a755beec559db095a90eff67583d63719 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 28 Aug 2025 14:25:58 +0000 Subject: [PATCH 2/3] Add OCR verification to check for text extraction after OCR completion Co-authored-by: maxi07 <7480270+maxi07@users.noreply.github.com> --- ocr_service/main.py | 20 +++++++++-- tests/test_ocr_verification.py | 62 ++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 tests/test_ocr_verification.py diff --git a/ocr_service/main.py b/ocr_service/main.py index 7268564..8f452bc 100644 --- a/ocr_service/main.py +++ b/ocr_service/main.py @@ -1,9 +1,10 @@ from scansynclib.logging import logger from scansynclib.ProcessItem import ProcessItem, ProcessStatus, OCRStatus from scansynclib.sqlite_wrapper import update_scanneddata_database -from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq +from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq, extract_text import pickle import ocrmypdf +import os from datetime import datetime import time import pika.exceptions @@ -36,13 +37,26 @@ def start_processing(item: ProcessItem): try: result = ocrmypdf.ocr(item.local_file_path, item.ocr_file, output_type='pdfa', skip_text=True, rotate_pages=True, jpg_quality=80, png_quality=80, optimize=2, language=["eng", "deu"], tesseract_timeout=120) + logger.debug(f"OCR exited with code {result}") + if result != 0: logger.error(f"OCR exited with code {result}") item.ocr_status = OCRStatus.FAILED else: logger.info(f"OCR processing completed: {item.filename}") - logger.debug(f"OCR exited with code {result}") - item.ocr_status = OCRStatus.COMPLETED + + # Verify that the OCR file actually contains text + if os.path.exists(item.ocr_file): + extracted_text = extract_text(item.ocr_file).strip() + if extracted_text: + logger.info(f"OCR verification successful: extracted {len(extracted_text)} characters from {item.filename}") + item.ocr_status = OCRStatus.COMPLETED + else: + logger.warning(f"OCR verification failed: no text found in OCR output file {item.ocr_file}") + item.ocr_status = OCRStatus.FAILED + else: + logger.error(f"OCR output file not found: {item.ocr_file}") + item.ocr_status = OCRStatus.OUTPUT_ERROR except ocrmypdf.UnsupportedImageFormatError: logger.error(f"Unsupported image format: {item.local_file_path}") item.ocr_status = OCRStatus.UNSUPPORTED diff --git a/tests/test_ocr_verification.py b/tests/test_ocr_verification.py new file mode 100644 index 0000000..9f1c12d --- /dev/null +++ b/tests/test_ocr_verification.py @@ -0,0 +1,62 @@ +import pytest +import os +import tempfile +from unittest.mock import Mock, patch, mock_open +from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus + + +class TestOCRTextVerification: + """Test OCR text verification functionality without importing the main OCR service.""" + + def test_extract_text_returns_empty_string_on_empty_pdf(self): + """Test that extract_text returns empty string for a PDF with no text.""" + from scansynclib.helpers import extract_text + + # Create a temporary file that simulates an empty PDF + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: + temp_file.write(b"%PDF-1.4\n") # Minimal PDF header + temp_file_path = temp_file.name + + try: + # extract_text should return empty string for malformed/empty PDF + result = extract_text(temp_file_path) + assert result == "" + finally: + os.unlink(temp_file_path) + + def test_extract_text_returns_empty_string_on_nonexistent_file(self): + """Test that extract_text returns empty string for non-existent file.""" + from scansynclib.helpers import extract_text + + result = extract_text("/nonexistent/file.pdf") + assert result == "" + + @patch('scansynclib.helpers.PdfReader') + def test_extract_text_strips_whitespace(self, mock_pdf_reader): + """Test that extract_text properly handles text with whitespace.""" + from scansynclib.helpers import extract_text + + # Mock the PDF reader to return text with whitespace + mock_page = Mock() + mock_page.extract_text.return_value = " \n\t Some text \n\t " + mock_reader = Mock() + mock_reader.pages = [mock_page] + mock_pdf_reader.return_value = mock_reader + + result = extract_text("dummy_path.pdf") + assert result == " \n\t Some text \n\t " # Should return raw text, not stripped + + def test_process_item_has_ocr_file_attribute(self): + """Test that ProcessItem correctly sets the OCR file path.""" + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: + temp_file_path = temp_file.name + + try: + item = ProcessItem(temp_file_path, ItemType.PDF) + + # Verify OCR file path is set correctly + assert hasattr(item, 'ocr_file') + assert item.ocr_file.endswith('_OCR.pdf') + assert item.ocr_status == OCRStatus.UNKNOWN + finally: + os.unlink(temp_file_path) \ No newline at end of file From 7aa489c2a48db1a10b91fada2e55cc28033d245c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:26:24 +0000 Subject: [PATCH 3/3] Add OCR status persistence, UI display, and comprehensive start_processing tests Co-authored-by: maxi07 <7480270+maxi07@users.noreply.github.com> Agent-Logs-Url: https://github.com/maxi07/ScanSync/sessions/a2c6fc77-605f-48ae-b6b2-579df2e70e6c --- ocr_service/main.py | 5 +- scansynclib/scansynclib/sqlite_wrapper.py | 5 + tests/test_ocr_verification.py | 273 +++++++++++++++++++++- web_service/src/main.py | 1 + web_service/src/static/js/dashboard.js | 52 +++++ 5 files changed, 324 insertions(+), 12 deletions(-) diff --git a/ocr_service/main.py b/ocr_service/main.py index 8f452bc..aff3aa9 100644 --- a/ocr_service/main.py +++ b/ocr_service/main.py @@ -95,7 +95,7 @@ def start_processing(item: ProcessItem): logger.error(f"Failed to forward item {item.filename} to the next service: {e}") item.status = ProcessStatus.FAILED finally: - update_scanneddata_database(item, {"file_status": item.status.value}) + update_scanneddata_database(item, {"file_status": item.status.value, "ocr_status": item.ocr_status.name}) return item @@ -115,4 +115,5 @@ def start_consuming_with_reconnect(): # Start the consumer with reconnect logic -start_consuming_with_reconnect() +if __name__ == "__main__": + start_consuming_with_reconnect() diff --git a/scansynclib/scansynclib/sqlite_wrapper.py b/scansynclib/scansynclib/sqlite_wrapper.py index c6576f0..c05dcf9 100644 --- a/scansynclib/scansynclib/sqlite_wrapper.py +++ b/scansynclib/scansynclib/sqlite_wrapper.py @@ -145,6 +145,11 @@ def upgrade_sql_database(): logger.info("Migration: Adding 'additional_smb' column to scanneddata table") cursor.execute("ALTER TABLE scanneddata ADD COLUMN additional_smb TEXT") conn.commit() + + if "ocr_status" not in columns: + logger.info("Migration: Adding 'ocr_status' column to scanneddata table") + cursor.execute("ALTER TABLE scanneddata ADD COLUMN ocr_status TEXT") + conn.commit() except sqlite3.OperationalError as e: if "no such table: scanneddata" in str(e): logger.error("Database schema is missing. Please ensure the schema.sql file is present.") diff --git a/tests/test_ocr_verification.py b/tests/test_ocr_verification.py index 9f1c12d..a09faa3 100644 --- a/tests/test_ocr_verification.py +++ b/tests/test_ocr_verification.py @@ -1,8 +1,9 @@ import pytest import os +import sys import tempfile -from unittest.mock import Mock, patch, mock_open -from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus +from unittest.mock import Mock, patch, MagicMock +from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus, ProcessStatus class TestOCRTextVerification: @@ -11,12 +12,12 @@ class TestOCRTextVerification: def test_extract_text_returns_empty_string_on_empty_pdf(self): """Test that extract_text returns empty string for a PDF with no text.""" from scansynclib.helpers import extract_text - + # Create a temporary file that simulates an empty PDF with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: temp_file.write(b"%PDF-1.4\n") # Minimal PDF header temp_file_path = temp_file.name - + try: # extract_text should return empty string for malformed/empty PDF result = extract_text(temp_file_path) @@ -27,7 +28,7 @@ def test_extract_text_returns_empty_string_on_empty_pdf(self): def test_extract_text_returns_empty_string_on_nonexistent_file(self): """Test that extract_text returns empty string for non-existent file.""" from scansynclib.helpers import extract_text - + result = extract_text("/nonexistent/file.pdf") assert result == "" @@ -35,14 +36,14 @@ def test_extract_text_returns_empty_string_on_nonexistent_file(self): def test_extract_text_strips_whitespace(self, mock_pdf_reader): """Test that extract_text properly handles text with whitespace.""" from scansynclib.helpers import extract_text - + # Mock the PDF reader to return text with whitespace mock_page = Mock() mock_page.extract_text.return_value = " \n\t Some text \n\t " mock_reader = Mock() mock_reader.pages = [mock_page] mock_pdf_reader.return_value = mock_reader - + result = extract_text("dummy_path.pdf") assert result == " \n\t Some text \n\t " # Should return raw text, not stripped @@ -50,13 +51,265 @@ def test_process_item_has_ocr_file_attribute(self): """Test that ProcessItem correctly sets the OCR file path.""" with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: temp_file_path = temp_file.name - + try: item = ProcessItem(temp_file_path, ItemType.PDF) - + # Verify OCR file path is set correctly assert hasattr(item, 'ocr_file') assert item.ocr_file.endswith('_OCR.pdf') assert item.ocr_status == OCRStatus.UNKNOWN finally: - os.unlink(temp_file_path) \ No newline at end of file + os.unlink(temp_file_path) + + +def _load_ocr_main(): + """ + Load ocr_service.main with all heavy external dependencies mocked so it + can be imported in a unit-test environment (no RabbitMQ, no Redis, no DB). + Returns (module, mock_ocrmypdf) so callers can reuse the exception classes. + """ + # Build lightweight exception classes that match the real ones structurally + mock_ocrmypdf = MagicMock() + mock_ocrmypdf.UnsupportedImageFormatError = type( + 'UnsupportedImageFormatError', (Exception,), {}) + mock_ocrmypdf.DpiError = type('DpiError', (Exception,), {}) + mock_ocrmypdf.InputFileError = type('InputFileError', (Exception,), {}) + mock_ocrmypdf.OutputFileAccessError = type('OutputFileAccessError', (Exception,), {}) + mock_ocrmypdf.MissingDependencyError = type('MissingDependencyError', (Exception,), {}) + + mock_pika = MagicMock() + mock_pika.exceptions = MagicMock() + mock_pika.exceptions.AMQPConnectionError = Exception + + mock_settings_mod = MagicMock() + mock_settings_mod.settings = MagicMock() + mock_settings_mod.settings.file_naming = MagicMock() + mock_settings_mod.settings.file_naming.ollama_server_url = None + mock_settings_mod.settings.file_naming.ollama_server_port = None + mock_settings_mod.settings.file_naming.ollama_model = None + mock_settings_mod.settings.file_naming.openai_api_key = None + + module_patches = { + 'ocrmypdf': mock_ocrmypdf, + 'pika': mock_pika, + 'scansynclib.settings': mock_settings_mod, + 'scansynclib.sqlite_wrapper': MagicMock(), + } + + # Evict any cached copies so the module is re-executed with our mocks + for key in list(sys.modules.keys()): + if key in ('ocr_service.main', 'ocr_service'): + del sys.modules[key] + + with patch.dict('sys.modules', module_patches): + import ocr_service.main as ocr_main # noqa: PLC0415 + return ocr_main, mock_ocrmypdf + + +class TestStartProcessing: + """Test the start_processing function from the OCR service.""" + + def _create_mock_item(self): + """Create a mock ProcessItem for testing.""" + item = Mock(spec=ProcessItem) + item.filename = "test.pdf" + item.local_file_path = "/tmp/test.pdf" + item.ocr_file = "/tmp/test_OCR.pdf" + item.ocr_status = OCRStatus.UNKNOWN + item.status = ProcessStatus.OCR_PENDING + item.db_id = 1 + item.time_ocr_started = None + item.time_ocr_finished = None + return item + + # ------------------------------------------------------------------ + # Helpers to reduce boilerplate in each test + # ------------------------------------------------------------------ + def _setup_ocr_mod(self, mock_ocr_mod, mock_ocrmypdf): + """Attach the fake exception classes to the mock ocrmypdf module.""" + mock_ocr_mod.UnsupportedImageFormatError = mock_ocrmypdf.UnsupportedImageFormatError + mock_ocr_mod.DpiError = mock_ocrmypdf.DpiError + mock_ocr_mod.InputFileError = mock_ocrmypdf.InputFileError + mock_ocr_mod.OutputFileAccessError = mock_ocrmypdf.OutputFileAccessError + mock_ocr_mod.MissingDependencyError = mock_ocrmypdf.MissingDependencyError + + # ------------------------------------------------------------------ + # Tests + # ------------------------------------------------------------------ + + def test_ocr_success_with_text_sets_completed(self): + """When OCR succeeds and text is found, ocr_status should be COMPLETED.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database') as mock_update_db, \ + patch.object(ocr_main, 'extract_text', return_value="Real OCR text."), \ + patch.object(ocr_main, 'os') as mock_os, \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + mock_ocr_mod.ocr.return_value = 0 + mock_os.path.exists.return_value = True + + item = self._create_mock_item() + ocr_main.start_processing(item) + + assert item.ocr_status == OCRStatus.COMPLETED + final_call = mock_update_db.call_args_list[-1][0][1] + assert final_call.get("ocr_status") == OCRStatus.COMPLETED.name + + def test_ocr_success_no_text_sets_failed(self): + """When OCR succeeds but no text found, ocr_status should be FAILED.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database') as mock_update_db, \ + patch.object(ocr_main, 'extract_text', return_value=""), \ + patch.object(ocr_main, 'os') as mock_os, \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + mock_ocr_mod.ocr.return_value = 0 + mock_os.path.exists.return_value = True + + item = self._create_mock_item() + ocr_main.start_processing(item) + + assert item.ocr_status == OCRStatus.FAILED + final_call = mock_update_db.call_args_list[-1][0][1] + assert final_call.get("ocr_status") == OCRStatus.FAILED.name + + def test_ocr_success_whitespace_only_sets_failed(self): + """When OCR succeeds but only whitespace found, ocr_status should be FAILED.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database'), \ + patch.object(ocr_main, 'extract_text', return_value=" \n\t \n "), \ + patch.object(ocr_main, 'os') as mock_os, \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + mock_ocr_mod.ocr.return_value = 0 + mock_os.path.exists.return_value = True + + item = self._create_mock_item() + ocr_main.start_processing(item) + + assert item.ocr_status == OCRStatus.FAILED + + def test_ocr_success_missing_output_file_sets_output_error(self): + """When OCR succeeds but output file is missing, status should be OUTPUT_ERROR.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database') as mock_update_db, \ + patch.object(ocr_main, 'os') as mock_os, \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + mock_ocr_mod.ocr.return_value = 0 + mock_os.path.exists.return_value = False # output file missing + + item = self._create_mock_item() + ocr_main.start_processing(item) + + assert item.ocr_status == OCRStatus.OUTPUT_ERROR + final_call = mock_update_db.call_args_list[-1][0][1] + assert final_call.get("ocr_status") == OCRStatus.OUTPUT_ERROR.name + + def test_ocr_nonzero_exit_code_sets_failed(self): + """When OCR exits with non-zero code, ocr_status should be FAILED.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database'), \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + mock_ocr_mod.ocr.return_value = 1 + + item = self._create_mock_item() + ocr_main.start_processing(item) + + assert item.ocr_status == OCRStatus.FAILED + + def test_ocr_unsupported_format_sets_unsupported(self): + """When OCR raises UnsupportedImageFormatError, status should be UNSUPPORTED.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database'), \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + UnsupportedError = mock_ocrmypdf.UnsupportedImageFormatError + mock_ocr_mod.UnsupportedImageFormatError = UnsupportedError + mock_ocr_mod.ocr.side_effect = UnsupportedError("Unsupported format") + + item = self._create_mock_item() + ocr_main.start_processing(item) + + assert item.ocr_status == OCRStatus.UNSUPPORTED + + def test_ocr_dpi_error_sets_dpi_error(self): + """When OCR raises DpiError, ocr_status should be DPI_ERROR.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database'), \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + DpiError = mock_ocrmypdf.DpiError + mock_ocr_mod.DpiError = DpiError + mock_ocr_mod.ocr.side_effect = DpiError("DPI too low") + + item = self._create_mock_item() + ocr_main.start_processing(item) + + assert item.ocr_status == OCRStatus.DPI_ERROR + + def test_ocr_success_forwards_to_upload_queue(self): + """When OCR succeeds with text, item should be forwarded to upload queue.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database'), \ + patch.object(ocr_main, 'extract_text', return_value="Real text."), \ + patch.object(ocr_main, 'os') as mock_os, \ + patch.object(ocr_main, 'forward_to_rabbitmq') as mock_forward: + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + mock_ocr_mod.ocr.return_value = 0 + mock_os.path.exists.return_value = True + + item = self._create_mock_item() + ocr_main.start_processing(item) + + mock_forward.assert_called_once_with("upload_queue", item) + + def test_ocr_db_updated_with_ocr_status(self): + """Verify the DB final update includes both file_status and ocr_status.""" + ocr_main, mock_ocrmypdf = _load_ocr_main() + + with patch.object(ocr_main, 'ocrmypdf') as mock_ocr_mod, \ + patch.object(ocr_main, 'update_scanneddata_database') as mock_update_db, \ + patch.object(ocr_main, 'extract_text', return_value="Extracted text."), \ + patch.object(ocr_main, 'os') as mock_os, \ + patch.object(ocr_main, 'forward_to_rabbitmq'): + + self._setup_ocr_mod(mock_ocr_mod, mock_ocrmypdf) + mock_ocr_mod.ocr.return_value = 0 + mock_os.path.exists.return_value = True + + item = self._create_mock_item() + ocr_main.start_processing(item) + + # At least two DB calls: initial status update + final status update + assert mock_update_db.call_count >= 2 + final_update = mock_update_db.call_args_list[-1][0][1] + assert "file_status" in final_update + assert "ocr_status" in final_update + assert final_update["ocr_status"] == OCRStatus.COMPLETED.name diff --git a/web_service/src/main.py b/web_service/src/main.py index a274290..507778a 100644 --- a/web_service/src/main.py +++ b/web_service/src/main.py @@ -88,6 +88,7 @@ def callback(ch, method, properties, body): currently_uploading=item.current_uploading, current_upload_target=item.current_upload_target, badges=badges, # Add the generated badges + ocr_status=item.ocr_status.name if item.ocr_status else None, ) payload["dashboard_data"] = get_dashboard_info() # Nur bei Bedarf abrufen sse_queue.put(json.dumps(payload, default=str)) # Ensure all objects are serializable diff --git a/web_service/src/static/js/dashboard.js b/web_service/src/static/js/dashboard.js index 86e3d5f..ff84338 100644 --- a/web_service/src/static/js/dashboard.js +++ b/web_service/src/static/js/dashboard.js @@ -177,6 +177,30 @@ function updateCard(updateData) { console.error(`Error updating file status: ${error.message}`); } + // Update OCR Status + try { + if (updateData.ocr_status !== undefined) { + const ocrStatusSpan = document.getElementById(updateData.id + "_ocr_status"); + if (ocrStatusSpan) { + const ocrStatusText = getOcrStatusText(updateData.ocr_status); + ocrStatusSpan.innerHTML = ''; + if (ocrStatusText) { + const icon = document.createElement('i'); + icon.className = 'bi bi-exclamation-triangle-fill text-warning'; + const small = document.createElement('small'); + small.className = 'text-warning'; + small.textContent = ocrStatusText; + ocrStatusSpan.appendChild(icon); + ocrStatusSpan.appendChild(document.createTextNode(' ')); + ocrStatusSpan.appendChild(small); + ocrStatusSpan.appendChild(document.createElement('br')); + } + } + } + } catch (error) { + console.error(`Error updating OCR status: ${error.message}`); + } + // Update File Name try { if (updateData.file_name && updateData.file_name.trim() !== "") { @@ -446,6 +470,22 @@ function addPdfCard(pdfData) { statusSpan.textContent = pdfData.file_status || "N/A"; statusSpan.innerHTML += brElement; + // Create OCR status element (only shown when OCR failed) + let ocrStatusSpan = document.createElement('span'); + ocrStatusSpan.id = pdfData.id + '_ocr_status'; + const ocrStatusText = getOcrStatusText(pdfData.ocr_status); + if (ocrStatusText) { + const ocrIcon = document.createElement('i'); + ocrIcon.className = 'bi bi-exclamation-triangle-fill text-warning'; + const ocrSmall = document.createElement('small'); + ocrSmall.className = 'text-warning'; + ocrSmall.textContent = ocrStatusText; + ocrStatusSpan.appendChild(ocrIcon); + ocrStatusSpan.appendChild(document.createTextNode(' ')); + ocrStatusSpan.appendChild(ocrSmall); + ocrStatusSpan.appendChild(document.createElement('br')); + } + // Create segmented progress bar container const progressContainer = document.createElement('div'); progressContainer.classList.add('progress-bar-wrapper'); @@ -475,6 +515,7 @@ function addPdfCard(pdfData) { infoParagraph.appendChild(smbContainer); infoParagraph.appendChild(statusText); infoParagraph.appendChild(statusSpan); + infoParagraph.appendChild(ocrStatusSpan); bodyDiv.appendChild(titleElement); bodyDiv.appendChild(infoParagraph); @@ -533,6 +574,17 @@ function getStatusIcon(file_status) { return status_icon; } +function getOcrStatusText(ocr_status) { + const ocrFailureMessages = { + 'FAILED': 'OCR: No text found', + 'UNSUPPORTED': 'OCR: Unsupported format', + 'DPI_ERROR': 'OCR: Image DPI too low', + 'INPUT_ERROR': 'OCR: Input file error', + 'OUTPUT_ERROR': 'OCR: Output file error', + }; + return ocrFailureMessages[ocr_status] || null; +} + function updateProgressBar(pdfId, newStep) { const progressBar = document.getElementById(`${pdfId}_progress_bar`); if (!progressBar) return;