Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 33 additions & 32 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,39 @@
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import is_temp_file_path, lazyproperty

# -- Lookup table from a built-in Word paragraph-style name to the element class used for
# -- a paragraph carrying that style. Documentation on the built-in style names:
# -- https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html
# -- (section "Paragraph styles in default template"). Key order matches the docs listing.
STYLE_TO_ELEMENT_MAPPING = {
    "Caption": Text,  # TODO(robinson) - add caption element type
    # -- "Heading 1" through "Heading 9" are all treated as titles --
    **{f"Heading {level}": Title for level in range(1, 10)},
    "Intense Quote": Text,  # TODO(robinson) - add quote element type
    # -- each list family comes in three variants: "<family>", "<family> 2", "<family> 3" --
    **{
        f"{family}{variant}": ListItem
        for family in ("List", "List Bullet", "List Continue", "List Number")
        for variant in ("", " 2", " 3")
    },
    "List Paragraph": ListItem,
    "Macro Text": Text,
    "No Spacing": Text,
    "Quote": Text,  # TODO(robinson) - add quote element type
    "Subtitle": Title,
    # -- NOTE(review): python-docx may report this style's name as "TOC Heading" (with a
    # -- space); confirm this key actually matches `style.name` for that style.
    "TOCHeading": Title,
    "Title": Title,
}

# -- origin tag for this partitioner; presumably recorded on emitted elements to identify
# -- which partitioner detected them -- TODO confirm against its usages
DETECTION_ORIGIN: str = "docx"
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
# -- a block-level item is either a paragraph (CT_P) or a table (CT_Tbl); the alias is
# -- written in string (forward-reference) form --
BlockElement: TypeAlias = "CT_P | CT_Tbl"
Expand Down Expand Up @@ -925,38 +958,6 @@ def _style_based_element_type(self, paragraph: Paragraph) -> Type[Text] | None:
# NOTE(robinson) - documentation on built-in styles at the link below:
# https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html \
# #paragraph-styles-in-default-template
STYLE_TO_ELEMENT_MAPPING = {
"Caption": Text, # TODO(robinson) - add caption element type
"Heading 1": Title,
"Heading 2": Title,
"Heading 3": Title,
"Heading 4": Title,
"Heading 5": Title,
"Heading 6": Title,
"Heading 7": Title,
"Heading 8": Title,
"Heading 9": Title,
"Intense Quote": Text, # TODO(robinson) - add quote element type
"List": ListItem,
"List 2": ListItem,
"List 3": ListItem,
"List Bullet": ListItem,
"List Bullet 2": ListItem,
"List Bullet 3": ListItem,
"List Continue": ListItem,
"List Continue 2": ListItem,
"List Continue 3": ListItem,
"List Number": ListItem,
"List Number 2": ListItem,
"List Number 3": ListItem,
"List Paragraph": ListItem,
"Macro Text": Text,
"No Spacing": Text,
"Quote": Text, # TODO(robinson) - add quote element type
"Subtitle": Title,
"TOCHeading": Title,
"Title": Title,
}

# -- paragraph.style can be None in rare cases, so can style.name. That's going
# -- to mean default style which is equivalent to "Normal" for our purposes.
Expand Down