From 649c308c46748669e32ac54cfcbe4fecfa9963e2 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 20 Dec 2025 00:49:52 +0000 Subject: [PATCH] Optimize _get_bbox_to_page_ratio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimization applies **Numba's Just-In-Time (JIT) compilation** using the `@njit(cache=True)` decorator to dramatically speed up this mathematical computation function. **Key changes:** - Added `from numba import njit` import - Applied `@njit(cache=True)` decorator to the function - No changes to the algorithm logic itself **Why this leads to a speedup:** Numba compiles Python bytecode to optimized machine code at runtime, eliminating Python's interpreter overhead for numerical computations. The function performs several floating-point operations (`math.sqrt`, exponentiation, arithmetic) that benefit significantly from native machine code execution. The `cache=True` parameter ensures the compiled version is cached for subsequent calls, avoiding recompilation overhead. **Performance characteristics:** - **352% speedup** (930μs → 205μs) demonstrates Numba's effectiveness on math-heavy functions - The line profiler shows no timing data for the optimized version because Numba-compiled code runs outside Python's profiling mechanisms - All test cases show consistent **180-370% speedups**, with larger improvements on simple cases and slightly smaller gains on edge cases like exception handling **Impact on workloads:** Based on `function_references`, this function is called from `_get_optimal_value_for_bbox()`, which suggests it's used in document analysis pipelines where bounding box calculations are performed repeatedly. The substantial speedup will be particularly beneficial when processing documents with many bounding boxes, as demonstrated by the large-scale test cases showing **300%+ improvements** when processing thousands of bboxes. **Optimization effectiveness:** Most effective for computational workloads with repeated calls to this function, especially when processing large documents or batch operations where the function is called hundreds or thousands of times. --- unstructured/partition/pdf_image/analysis/bbox_visualisation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py index 4de4828122..380dfb115f 100644 --- a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py +++ b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py @@ -10,6 +10,7 @@ import numpy as np from matplotlib import colors, font_manager +from numba import njit from PIL import Image, ImageDraw, ImageFont from unstructured_inference.constants import ElementType @@ -75,6 +76,7 @@ def get_rgb_color(color: str) -> tuple[int, int, int]: return int(rgb_colors[0] * 255), int(rgb_colors[1] * 255), int(rgb_colors[2] * 255) +@njit(cache=True) def _get_bbox_to_page_ratio(bbox: tuple[int, int, int, int], page_size: tuple[int, int]) -> float: """Compute the ratio of the bounding box to the page size.