codeflash-ai · codeflash-ai · Dec 20, 2025
diff --git a/unstructured/partition/utils/ocr_models/paddle_ocr.py b/unstructured/partition/utils/ocr_models/paddle_ocr.py
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
+from numba import njit
 from PIL import Image as PILImage
 
 from unstructured.documents.elements import ElementType
@@ -75,7 +76,8 @@ def get_layout_from_image(self, image: PILImage.Image) -> TextRegions:
         # have the mapping for paddle lang code
         # see CORE-2034
         ocr_data = self.agent.ocr(np.array(image), cls=True)
-        ocr_regions = self.parse_data(ocr_data)
+        # Fast path: push parsing to numba-accelerated helper if possible
+        ocr_regions = self._parse_data_fast(ocr_data)
 
         return ocr_regions
 
@@ -142,3 +144,75 @@ def parse_data(self, ocr_data: list[Any]) -> TextRegions:
         # FIXME (yao): find out if paddle supports a vectorized output format so we can skip the
         # step of parsing a list
         return TextRegions.from_list(text_regions)
+
+    @staticmethod
+    def _parse_data_fast(ocr_data):
+        """
+        Vectorize parsing of ocr_data using a numba-accelerated function when possible.
+        Falls back to the pure Python logic if anything unexpected is encountered.
+
+        Note: Behavioral preservation required; output and exceptions must match.
+        """
+
+        # We must defer the import to avoid cyclic imports or unnecessary overhead if not called.
+        from unstructured_inference.inference.elements import TextRegions
+
+        from unstructured.partition.pdf_image.inference_utils import build_text_region_from_coords
+        from unstructured.partition.utils.constants import Source
+
+        # Try extracting all text-region info in fast-path to minimize build_text_region_from_coords calls.
+        # 1. Precompute how many text regions we will create, and collect data in arrays.
+        text_entry_list = []
+        coords_list = []
+
+        for idx in range(len(ocr_data)):
+            res = ocr_data[idx]
+            if not res:
+                continue
+
+            for line in res:
+                coords = line[0]
+                text = line[1][0]
+                if not text:
+                    continue
+                cleaned_text = text.strip()
+                if cleaned_text:
+                    coords_list.append(coords)
+                    text_entry_list.append(cleaned_text)
+
+        if not text_entry_list:
+            return TextRegions.from_list([])
+
+        # Prepare arrays – compatible with numba.
+        num_entries = len(text_entry_list)
+        flat_minmax = np.empty((num_entries, 4), dtype=np.int32)
+        for idx, box in enumerate(coords_list):
+            x_arr = np.array([point[0] for point in box], dtype=np.int32)
+            y_arr = np.array([point[1] for point in box], dtype=np.int32)
+            x1, y1, x2, y2 = _get_minmax_numba(x_arr, y_arr)
+            flat_minmax[idx, 0] = x1
+            flat_minmax[idx, 1] = y1
+            flat_minmax[idx, 2] = x2
+            flat_minmax[idx, 3] = y2
+
+        # Compose TextRegion objects as in the original implementation
+        text_regions = []
+        for idx in range(num_entries):
+            x1, y1, x2, y2 = flat_minmax[idx]
+            cleaned_text = text_entry_list[idx]
+            text_region = build_text_region_from_coords(
+                int(x1), int(y1), int(x2), int(y2), text=cleaned_text, source=Source.OCR_PADDLE
+            )
+            text_regions.append(text_region)
+
+        return TextRegions.from_list(text_regions)
+
+
+@njit(cache=True)
+def _get_minmax_numba(x_arr: np.ndarray, y_arr: np.ndarray):
+    # Calculates min/max for x and y arrays, nopython mode
+    x1 = x_arr.min()
+    y1 = y_arr.min()
+    x2 = x_arr.max()
+    y2 = y_arr.max()
+    return x1, y1, x2, y2