-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocx_utils.py
More file actions
186 lines (155 loc) · 6.62 KB
/
docx_utils.py
File metadata and controls
186 lines (155 loc) · 6.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import base64
import os
import re
from io import BytesIO
from xml.etree.ElementTree import XML
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import RGBColor
from tika import parser
from werkzeug.utils import secure_filename
def get_html_from_docx(file_path):
    """Run the file at *file_path* through Tika and return its markup.

    The file is parsed with ``xmlContent=True`` and the ``'content'`` entry
    of the parse result is returned (``None`` when that entry is absent).
    """
    tika_result = parser.from_file(file_path, xmlContent=True)
    return tika_result.get('content')
def extractText(html_content):
    """Return the concatenated text of every string node in *html_content*.

    The markup is parsed with ``html.parser`` and all ``NavigableString``
    nodes are joined in document order, without any separator.

    Args:
        html_content: HTML markup to extract text from.

    Returns:
        A single string containing the text of all string nodes.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Walk the tree iteratively and join once: this avoids one Python stack
    # frame per nesting level and repeated string concatenation.
    return ''.join(
        str(node)
        for node in soup.descendants
        if isinstance(node, NavigableString)
    )
def replace_substrings(main_string, replacements):
    """Replace whole-word occurrences of the mapping's keys in one pass.

    Args:
        main_string: Text to search.
        replacements: Mapping of text to find -> replacement text.

    Returns:
        ``main_string`` with every match, delimited by regex word
        boundaries (``\\b``), replaced by its mapped value. When
        ``replacements`` is empty the input is returned unchanged.
    """
    if not replacements:
        # An empty alternation would match the empty string at every word
        # boundary and then fail the lookup with KeyError('').
        return main_string

    # Regex alternation takes the first alternative that matches, so try the
    # longest keys first; otherwise "Rukh" would shadow "Rukh Khan".
    ordered_keys = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(
        r"\b(" + "|".join(re.escape(key) for key in ordered_keys) + r")\b"
    )
    return pattern.sub(lambda match: replacements[match.group(0)], main_string)
def modify_html_content(mapDict, html_content):
    """Apply ``replace_substrings`` to every text node of an HTML document.

    Args:
        mapDict: Mapping of text to find -> replacement text.
        html_content: HTML markup to process.

    Returns:
        The parsed document serialised back to a string, with the
        replacements applied to its string nodes.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    if not mapDict:
        # Nothing to replace; skip building a pattern from zero keys.
        return str(soup)

    # Snapshot the string nodes first: replacing a node while walking the
    # live tree would disturb the traversal.
    text_nodes = [
        node for node in soup.descendants if isinstance(node, NavigableString)
    ]
    for node in text_nodes:
        original_text = str(node)
        updated_text = replace_substrings(original_text, mapDict)
        if updated_text != original_text:
            # Rebuild with the node's own class so special string nodes
            # (comments, doctype, ...) keep their serialised form.
            node.replace_with(type(node)(updated_text))
    return str(soup)
def export_to_docx(original_file_path, mapDict, uploadFolder):
    """Write a copy of a DOCX file with the ``mapDict`` replacements applied.

    The source file is converted to markup, the replacements are applied to
    its text, and a new document is built from the result and saved in
    ``uploadFolder`` as ``modified_<original file name>`` (sanitised with
    ``secure_filename``).

    Args:
        original_file_path: Path of the source DOCX file.
        mapDict: Mapping of text to find -> replacement text.
        uploadFolder: Directory in which the modified file is written.

    Returns:
        A ``(path, mime_type)`` tuple for the generated file.
    """
    output_name = secure_filename("modified_" + os.path.basename(original_file_path))
    output_path = os.path.join(uploadFolder, output_name)

    html_content = get_html_from_docx(original_file_path)
    modified_html_content = modify_html_content(mapDict, html_content)
    html_to_docx(original_file_path, modified_html_content, output_path)

    # The output is saved by python-docx, i.e. as an Office Open XML (.docx)
    # package, so report the OOXML media type rather than the legacy binary
    # .doc type ("application/msword").
    docx_mime_type = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    return output_path, docx_mime_type
def extract_embedded_images(docx_file):
    """Collect the raw bytes of every ``word/media/`` member of an archive.

    Args:
        docx_file: Path or file object accepted by ``ZipFile``.

    Returns:
        Dict mapping each archive member name that starts with
        ``word/media/`` to that member's bytes.
    """
    media_prefix = 'word/media/'
    with ZipFile(docx_file, 'r') as archive:
        return {
            member: archive.read(member)
            for member in archive.namelist()
            if member.startswith(media_prefix)
        }
def add_hyperlink(paragraph, url, text):
    """Append an external hyperlink to *paragraph*.

    Registers *url* as an external relationship on the paragraph's part and
    appends a ``w:hyperlink`` element containing a single run with *text*,
    styled with the ``Hyperlink`` character style.

    Args:
        paragraph: python-docx paragraph to append to.
        url: Link target.
        text: Visible link text.
    """
    # relate_to() takes the relationship-type URI itself. qn() only expands
    # "prefix:tag" names, so passing this URI through it raised KeyError.
    hyperlink_reltype = (
        'http://schemas.openxmlformats.org/officeDocument/2006/'
        'relationships/hyperlink'
    )
    r_id = paragraph.part.relate_to(url, hyperlink_reltype, is_external=True)

    hyperlink = OxmlElement('w:hyperlink')
    hyperlink.set(qn('r:id'), r_id)

    new_run = OxmlElement('w:r')
    run_properties = OxmlElement('w:rPr')
    # Reference the "Hyperlink" character style for the link appearance.
    run_style = OxmlElement('w:rStyle')
    run_style.set(qn('w:val'), 'Hyperlink')
    run_properties.append(run_style)
    new_run.append(run_properties)
    new_run.text = text

    hyperlink.append(new_run)
    paragraph._element.append(hyperlink)
def add_html_to_docx(html_content, doc, embedded_images):
    """Append the content of an HTML string to a python-docx document.

    At the top level, ``<p>``, ``<h1>``-``<h6>`` and ``<img>`` elements and
    bare text nodes are converted; other elements are ignored. Inside a
    paragraph, text nodes and ``<b>``, ``<i>``, ``<u>``, ``<a>`` and
    ``<img>`` children are converted; other children are ignored.

    Args:
        html_content: HTML markup to convert.
        doc: python-docx document to append to.
        embedded_images: Mapping of archive member names
            (``word/media/...``) to image bytes, used to resolve
            ``embedded:`` image sources.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Upper bound on how long a remote image fetch may block the conversion.
    image_timeout_seconds = 10

    def load_image(img_url):
        """Return a BytesIO holding the image for *img_url*, or None."""
        if img_url.startswith('data:image'):
            image_data = re.sub(r'^data:image/.+;base64,', '', img_url)
            return BytesIO(base64.b64decode(image_data))
        if img_url.startswith('embedded:'):
            image_key = 'word/media/' + img_url.split(':')[1]
            if image_key in embedded_images:
                return BytesIO(embedded_images[image_key])
            return None
        # NOTE(review): this fetches whatever URL the markup names; confirm
        # the HTML only ever comes from trusted documents.
        response = requests.get(img_url, timeout=image_timeout_seconds)
        # Do not embed an HTTP error page as if it were image data.
        response.raise_for_status()
        return BytesIO(response.content)

    def add_image(element, paragraph):
        """Insert the picture referenced by an ``<img>`` tag into *paragraph*."""
        # Read src outside the try block so the error message below can
        # always refer to it.
        img_url = element.get('src')
        if not img_url:
            print(f"Error adding image ({img_url}): missing src attribute")
            return
        try:
            image = load_image(img_url)
            if image is not None:
                # Add the picture to the current paragraph so it stays in
                # reading order with the surrounding runs.
                paragraph.add_run().add_picture(image)
        except Exception as e:
            # Best effort: a broken image must not abort the conversion.
            print(f"Error adding image ({img_url}): {e}")

    def add_element_to_paragraph(element, paragraph):
        """Append one inline node (text or supported tag) to *paragraph*."""
        if isinstance(element, str):
            paragraph.add_run(element)
        elif element.name == 'b':
            paragraph.add_run(element.get_text()).bold = True
        elif element.name == 'i':
            paragraph.add_run(element.get_text()).italic = True
        elif element.name == 'u':
            paragraph.add_run(element.get_text()).underline = True
        elif element.name == 'a':
            # NOTE: rendered as blue underlined text only; no clickable
            # hyperlink relationship is created.
            run = paragraph.add_run(element.get_text())
            run.font.color.rgb = RGBColor(0, 0, 255)
            run.font.underline = True
        elif element.name == 'img':
            add_image(element, paragraph)

    def add_elements_to_doc(elements, doc):
        """Append each supported top-level node to *doc*."""
        for element in elements:
            # Check for text nodes first so tag attributes are only read on
            # real tags.
            if isinstance(element, str):
                doc.add_paragraph(element)
            elif element.name == 'p':
                paragraph = doc.add_paragraph()
                for child in element.children:
                    add_element_to_paragraph(child, paragraph)
            elif element.name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                doc.add_heading(element.get_text(), level=int(element.name[1]))
            elif element.name == 'img':
                add_element_to_paragraph(element, doc.add_paragraph())

    # Fragments without a <body> element have soup.body == None; fall back
    # to the parsed root in that case.
    root = soup.body if soup.body is not None else soup
    add_elements_to_doc(root.contents, doc)
def html_to_docx(original_file_path, html_content, output_file):
    """Build a new document from *html_content* and save it to *output_file*.

    Images stored in the archive at *original_file_path* are extracted and
    passed along so the HTML conversion can look them up.
    """
    images = extract_embedded_images(original_file_path)
    document = Document()
    add_html_to_docx(html_content, document, images)
    document.save(output_file)