@requires_dependencies(["pandas"])
def get_default_pandas_dtypes() -> dict[str, Any]:
    """Return the default pandas dtype for each known field name.

    The mapping is built on the first call and stored on this function object as `_cache`.
    Every call returns a shallow copy, so a caller can add, drop or replace entries in the
    returned dict without changing what later calls receive.
    """
    try:
        dtype_by_field = get_default_pandas_dtypes._cache
    except AttributeError:
        # -- first call: nothing stored yet, so build the mapping once --
        string = pd.StringDtype  # type: ignore
        nullable_int = "Int64"
        nullable_bool = "boolean"
        py_object = object

        dtype_by_field = {
            "text": string(),
            "type": string(),
            "element_id": string(),
            # -- optional string fields --
            "filename": string(),
            "filetype": string(),
            "file_directory": string(),
            "last_modified": string(),
            "attached_to_filename": string(),
            "parent_id": string(),
            "category_depth": nullable_int,  # optional int
            "image_path": string(),
            "languages": py_object,  # optional list of str
            "page_number": nullable_int,  # optional int
            "page_name": string(),
            "url": string(),
            "link_urls": string(),
            "link_texts": py_object,  # optional list of str
            "links": py_object,
            "sent_from": py_object,  # optional list of str
            "sent_to": py_object,  # optional list of str
            "subject": string(),
            "section": string(),
            "header_footer_type": string(),
            "emphasized_text_contents": py_object,  # optional list of str
            "emphasized_text_tags": py_object,  # optional list of str
            "text_as_html": string(),
            "max_characters": nullable_int,  # optional int
            "is_continuation": nullable_bool,  # optional bool
            "detection_class_prob": float,  # optional float
            "sender": string(),
            # -- coordinates fields --
            "coordinates_points": py_object,
            "coordinates_system": string(),
            "coordinates_layout_width": float,
            "coordinates_layout_height": float,
            # -- data-source fields --
            "data_source_url": string(),
            "data_source_version": string(),
            "data_source_record_locator": py_object,
            "data_source_date_created": string(),
            "data_source_date_modified": string(),
            "data_source_date_processed": string(),
            "data_source_permissions_data": py_object,
            "embeddings": py_object,
        }
        get_default_pandas_dtypes._cache = dtype_by_field

    # -- hand out a copy so the stored mapping is never mutated by callers --
    return dict(dtype_by_field)