Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions ocr_service/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from scansynclib.logging import logger
from scansynclib.ProcessItem import ProcessItem, ProcessStatus, OCRStatus
from scansynclib.sqlite_wrapper import update_scanneddata_database
from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq
from scansynclib.helpers import connect_rabbitmq, forward_to_rabbitmq, extract_text
import pickle
import ocrmypdf
import os
from datetime import datetime
import time
import pika.exceptions
Expand Down Expand Up @@ -36,13 +37,26 @@ def start_processing(item: ProcessItem):

try:
result = ocrmypdf.ocr(item.local_file_path, item.ocr_file, output_type='pdfa', skip_text=True, rotate_pages=True, jpg_quality=80, png_quality=80, optimize=2, language=["eng", "deu"], tesseract_timeout=120)
logger.debug(f"OCR exited with code {result}")

if result != 0:
logger.error(f"OCR exited with code {result}")
item.ocr_status = OCRStatus.FAILED
else:
logger.info(f"OCR processing completed: {item.filename}")
logger.debug(f"OCR exited with code {result}")
item.ocr_status = OCRStatus.COMPLETED

# Verify that the OCR file actually contains text
if os.path.exists(item.ocr_file):
extracted_text = extract_text(item.ocr_file).strip()
if extracted_text:
logger.info(f"OCR verification successful: extracted {len(extracted_text)} characters from {item.filename}")
item.ocr_status = OCRStatus.COMPLETED
else:
logger.warning(f"OCR verification failed: no text found in OCR output file {item.ocr_file}")
item.ocr_status = OCRStatus.FAILED
else:
logger.error(f"OCR output file not found: {item.ocr_file}")
item.ocr_status = OCRStatus.OUTPUT_ERROR
except ocrmypdf.UnsupportedImageFormatError:
logger.error(f"Unsupported image format: {item.local_file_path}")
item.ocr_status = OCRStatus.UNSUPPORTED
Expand Down Expand Up @@ -81,7 +95,7 @@ def start_processing(item: ProcessItem):
logger.error(f"Failed to forward item {item.filename} to the next service: {e}")
item.status = ProcessStatus.FAILED
finally:
update_scanneddata_database(item, {"file_status": item.status.value})
update_scanneddata_database(item, {"file_status": item.status.value, "ocr_status": item.ocr_status.name})
return item


Expand All @@ -101,4 +115,5 @@ def start_consuming_with_reconnect():


# Start the consumer with reconnect logic
start_consuming_with_reconnect()
if __name__ == "__main__":
start_consuming_with_reconnect()
5 changes: 5 additions & 0 deletions scansynclib/scansynclib/sqlite_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@ def upgrade_sql_database():
logger.info("Migration: Adding 'additional_smb' column to scanneddata table")
cursor.execute("ALTER TABLE scanneddata ADD COLUMN additional_smb TEXT")
conn.commit()

if "ocr_status" not in columns:
logger.info("Migration: Adding 'ocr_status' column to scanneddata table")
cursor.execute("ALTER TABLE scanneddata ADD COLUMN ocr_status TEXT")
conn.commit()
except sqlite3.OperationalError as e:
if "no such table: scanneddata" in str(e):
logger.error("Database schema is missing. Please ensure the schema.sql file is present.")
Expand Down
315 changes: 315 additions & 0 deletions tests/test_ocr_verification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,315 @@
import pytest
import os
import sys
import tempfile
from unittest.mock import Mock, patch, MagicMock
from scansynclib.ProcessItem import ProcessItem, ItemType, OCRStatus, ProcessStatus


class TestOCRTextVerification:
    """Checks for the text-extraction helper and freshly built ProcessItems.

    Nothing here imports the OCR service module itself.
    """

    def test_extract_text_returns_empty_string_on_empty_pdf(self):
        """A file holding nothing but a PDF header yields an empty string."""
        from scansynclib.helpers import extract_text

        # Only the version header is written, so the file looks like a PDF
        # but carries no content to extract.
        fd, header_only_pdf = tempfile.mkstemp(suffix='.pdf')
        with os.fdopen(fd, 'wb') as handle:
            handle.write(b"%PDF-1.4\n")

        try:
            assert extract_text(header_only_pdf) == ""
        finally:
            os.unlink(header_only_pdf)

    def test_extract_text_returns_empty_string_on_nonexistent_file(self):
        """A path that does not exist yields an empty string."""
        from scansynclib.helpers import extract_text

        missing_path = "/nonexistent/file.pdf"
        assert extract_text(missing_path) == ""

    @patch('scansynclib.helpers.PdfReader')
    def test_extract_text_strips_whitespace(self, mock_pdf_reader):
        """Whitespace around the page text is passed through untouched.

        Despite the test name, the expectation checked here is the raw,
        unstripped text.
        """
        from scansynclib.helpers import extract_text

        padded_text = " \n\t Some text \n\t "

        # One fake page whose extracted text is padded with whitespace.
        single_page = Mock()
        single_page.extract_text.return_value = padded_text
        mock_pdf_reader.return_value = Mock(pages=[single_page])

        assert extract_text("dummy_path.pdf") == padded_text

    def test_process_item_has_ocr_file_attribute(self):
        """A new ProcessItem exposes an OCR output path and an UNKNOWN OCR status."""
        fd, source_pdf = tempfile.mkstemp(suffix='.pdf')
        os.close(fd)

        try:
            item = ProcessItem(source_pdf, ItemType.PDF)

            assert hasattr(item, 'ocr_file')
            assert item.ocr_file.endswith('_OCR.pdf')
            assert item.ocr_status == OCRStatus.UNKNOWN
        finally:
            os.unlink(source_pdf)


def _load_ocr_main():
    """
    Import ``ocr_service.main`` with its heavy dependencies swapped for mocks.

    ``ocrmypdf``, ``pika``, ``scansynclib.settings`` and
    ``scansynclib.sqlite_wrapper`` are replaced in ``sys.modules`` for the
    duration of the import, and any cached copy of the service module is
    dropped first so that its top-level code runs again against the mocks.

    Returns a ``(module, mock_ocrmypdf)`` tuple; the second element carries
    the stand-in exception classes so callers can raise or compare them.
    """
    # Real exception subclasses (not Mock attributes) so they can be raised
    # and caught like the originals.
    fake_ocrmypdf = MagicMock()
    for exc_name in (
        'UnsupportedImageFormatError',
        'DpiError',
        'InputFileError',
        'OutputFileAccessError',
        'MissingDependencyError',
    ):
        setattr(fake_ocrmypdf, exc_name, type(exc_name, (Exception,), {}))

    # NOTE(review): AMQPConnectionError is aliased to the bare Exception, so
    # any handler for it in the service catches every Exception while under
    # test - confirm this is intended.
    fake_pika = MagicMock()
    fake_pika.exceptions = MagicMock(AMQPConnectionError=Exception)

    fake_settings_module = MagicMock()
    fake_settings_module.settings = MagicMock(
        file_naming=MagicMock(
            ollama_server_url=None,
            ollama_server_port=None,
            ollama_model=None,
            openai_api_key=None,
        ),
    )

    replacements = {
        'ocrmypdf': fake_ocrmypdf,
        'pika': fake_pika,
        'scansynclib.settings': fake_settings_module,
        'scansynclib.sqlite_wrapper': MagicMock(),
    }

    # Forget earlier imports of the service so it is executed afresh.
    for stale_name in ('ocr_service.main', 'ocr_service'):
        sys.modules.pop(stale_name, None)

    with patch.dict('sys.modules', replacements):
        import ocr_service.main as ocr_main  # noqa: PLC0415

    return ocr_main, fake_ocrmypdf


class TestStartProcessing:
    """Test the start_processing function from the OCR service.

    Every test drives ``start_processing`` through ``_run_start_processing``,
    which loads the service with mocked dependencies, swaps the collaborators
    a scenario needs, and hands back the item plus the mocks to inspect.
    """

    # Exception classes the service catches by name on the ocrmypdf module.
    _OCR_ERROR_NAMES = (
        'UnsupportedImageFormatError',
        'DpiError',
        'InputFileError',
        'OutputFileAccessError',
        'MissingDependencyError',
    )

    def _create_mock_item(self):
        """Create a mock ProcessItem for testing."""
        item = Mock(spec=ProcessItem)
        item.filename = "test.pdf"
        item.local_file_path = "/tmp/test.pdf"
        item.ocr_file = "/tmp/test_OCR.pdf"
        item.ocr_status = OCRStatus.UNKNOWN
        item.status = ProcessStatus.OCR_PENDING
        item.db_id = 1
        item.time_ocr_started = None
        item.time_ocr_finished = None
        return item

    # ------------------------------------------------------------------
    # Helpers to reduce boilerplate in each test
    # ------------------------------------------------------------------
    def _setup_ocr_mod(self, mock_ocr_mod, mock_ocrmypdf):
        """Attach the fake exception classes to the mock ocrmypdf module."""
        for error_name in self._OCR_ERROR_NAMES:
            setattr(mock_ocr_mod, error_name, getattr(mock_ocrmypdf, error_name))

    def _run_start_processing(self, *, ocr_result=0, error_name=None,
                              error_message="", extracted_text=None,
                              output_exists=None):
        """Run ``start_processing`` once with the requested collaborators patched.

        ``ocrmypdf``, ``update_scanneddata_database`` and
        ``forward_to_rabbitmq`` are always replaced on the service module.

        Args:
            ocr_result: Value returned by the fake ``ocrmypdf.ocr`` call
                (ignored when ``error_name`` is given).
            error_name: Name of a fake ocrmypdf exception class to raise from
                ``ocrmypdf.ocr`` instead of returning.
            error_message: Message passed to that exception.
            extracted_text: When not ``None``, ``extract_text`` is patched to
                return this string; when ``None`` it is left untouched.
            output_exists: When not ``None``, the module's ``os`` is patched
                and ``os.path.exists`` returns this value; when ``None`` the
                real ``os`` stays in place.

        Returns:
            Tuple ``(item, update_db_mock, forward_mock)``.
        """
        ocr_main, fake_ocrmypdf = _load_ocr_main()

        ocr_double = MagicMock()
        update_db = MagicMock()
        forward = MagicMock()
        replacements = {
            'ocrmypdf': ocr_double,
            'update_scanneddata_database': update_db,
            'forward_to_rabbitmq': forward,
        }
        if extracted_text is not None:
            replacements['extract_text'] = MagicMock(return_value=extracted_text)
        if output_exists is not None:
            os_double = MagicMock()
            os_double.path.exists.return_value = output_exists
            replacements['os'] = os_double

        # The service's `except ocrmypdf.<Error>` clauses need real classes.
        self._setup_ocr_mod(ocr_double, fake_ocrmypdf)
        if error_name is None:
            ocr_double.ocr.return_value = ocr_result
        else:
            error_cls = getattr(fake_ocrmypdf, error_name)
            ocr_double.ocr.side_effect = error_cls(error_message)

        item = self._create_mock_item()
        with patch.multiple(ocr_main, **replacements):
            ocr_main.start_processing(item)
        return item, update_db, forward

    @staticmethod
    def _last_db_update(update_db):
        """Return the field dict (second positional arg) of the final DB update call."""
        return update_db.call_args_list[-1][0][1]

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_ocr_success_with_text_sets_completed(self):
        """When OCR succeeds and text is found, ocr_status should be COMPLETED."""
        item, update_db, _ = self._run_start_processing(
            extracted_text="Real OCR text.", output_exists=True)

        assert item.ocr_status == OCRStatus.COMPLETED
        final_call = self._last_db_update(update_db)
        assert final_call.get("ocr_status") == OCRStatus.COMPLETED.name

    def test_ocr_success_no_text_sets_failed(self):
        """When OCR succeeds but no text found, ocr_status should be FAILED."""
        item, update_db, _ = self._run_start_processing(
            extracted_text="", output_exists=True)

        assert item.ocr_status == OCRStatus.FAILED
        final_call = self._last_db_update(update_db)
        assert final_call.get("ocr_status") == OCRStatus.FAILED.name

    def test_ocr_success_whitespace_only_sets_failed(self):
        """When OCR succeeds but only whitespace found, ocr_status should be FAILED."""
        item, _, _ = self._run_start_processing(
            extracted_text=" \n\t \n ", output_exists=True)

        assert item.ocr_status == OCRStatus.FAILED

    def test_ocr_success_missing_output_file_sets_output_error(self):
        """When OCR succeeds but output file is missing, status should be OUTPUT_ERROR."""
        item, update_db, _ = self._run_start_processing(output_exists=False)

        assert item.ocr_status == OCRStatus.OUTPUT_ERROR
        final_call = self._last_db_update(update_db)
        assert final_call.get("ocr_status") == OCRStatus.OUTPUT_ERROR.name

    def test_ocr_nonzero_exit_code_sets_failed(self):
        """When OCR exits with non-zero code, ocr_status should be FAILED."""
        item, _, _ = self._run_start_processing(ocr_result=1)

        assert item.ocr_status == OCRStatus.FAILED

    def test_ocr_unsupported_format_sets_unsupported(self):
        """When OCR raises UnsupportedImageFormatError, status should be UNSUPPORTED."""
        item, _, _ = self._run_start_processing(
            error_name='UnsupportedImageFormatError',
            error_message="Unsupported format")

        assert item.ocr_status == OCRStatus.UNSUPPORTED

    def test_ocr_dpi_error_sets_dpi_error(self):
        """When OCR raises DpiError, ocr_status should be DPI_ERROR."""
        item, _, _ = self._run_start_processing(
            error_name='DpiError', error_message="DPI too low")

        assert item.ocr_status == OCRStatus.DPI_ERROR

    def test_ocr_success_forwards_to_upload_queue(self):
        """When OCR succeeds with text, item should be forwarded to upload queue."""
        item, _, forward = self._run_start_processing(
            extracted_text="Real text.", output_exists=True)

        forward.assert_called_once_with("upload_queue", item)

    def test_ocr_db_updated_with_ocr_status(self):
        """Verify the DB final update includes both file_status and ocr_status."""
        _, update_db, _ = self._run_start_processing(
            extracted_text="Extracted text.", output_exists=True)

        # At least two DB calls: initial status update + final status update
        assert update_db.call_count >= 2
        final_update = self._last_db_update(update_db)
        assert "file_status" in final_update
        assert "ocr_status" in final_update
        assert final_update["ocr_status"] == OCRStatus.COMPLETED.name
1 change: 1 addition & 0 deletions web_service/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def callback(ch, method, properties, body):
currently_uploading=item.current_uploading,
current_upload_target=item.current_upload_target,
badges=badges, # Add the generated badges
ocr_status=item.ocr_status.name if item.ocr_status else None,
)
payload["dashboard_data"] = get_dashboard_info() # Nur bei Bedarf abrufen
sse_queue.put(json.dumps(payload, default=str)) # Ensure all objects are serializable
Expand Down
Loading