# Source: GhostDoc/docx_utils.py (rattle99/GhostDoc, master branch)
import os
import re
from tika import parser
from bs4 import BeautifulSoup, NavigableString, Tag
from werkzeug.utils import secure_filename
from docx import Document
from docx.shared import RGBColor
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from io import BytesIO
import requests
from zipfile import ZipFile
from xml.etree.ElementTree import XML

def get_html_from_docx(file_path):
    """Return the markup Tika extracts from the document at *file_path*.

    The file is parsed with ``xmlContent=True`` and the ``'content'`` entry
    of the parse result is returned (``None`` when that entry is absent).
    """
    tika_result = parser.from_file(file_path, xmlContent=True)
    return tika_result.get('content')

def extractText(html_content):
    """Return the concatenated text of every string node in *html_content*.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    the tree is walked depth-first, so text comes out in document order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    def getText(element):
        # Leaf: a string node contributes its own text.
        if isinstance(element, NavigableString):
            return str(element)
        # Container: glue together whatever the children contribute.
        if isinstance(element, Tag):
            return ''.join(getText(child) for child in element.contents)

    return getText(soup)

def replace_substrings(main_string, replacements):
    """Replace whole-word occurrences of the keys of *replacements*.

    Args:
        main_string: Text to search.
        replacements: Mapping from the text to find to the text to put in
            its place. Keys are matched literally (they are regex-escaped)
            and only between regex word boundaries.

    Returns:
        A new string with every match replaced in a single pass, or
        *main_string* unchanged when *replacements* is empty.
    """
    if not replacements:
        # An empty alternation would match the empty string at every word
        # boundary and then fail the dictionary lookup with KeyError('').
        return main_string

    # Longest keys first: alternation takes the first alternative that
    # matches, so a short key must not shadow a longer key it prefixes.
    keys = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(
        r"\b(" + "|".join(re.escape(key) for key in keys) + r")\b"
    )

    # A callable replacement keeps the substituted text literal (no
    # backslash or group-reference processing).
    return pattern.sub(lambda match: replacements[match.group(0)], main_string)

def modify_html_content(mapDict, html_content):
    """Apply the *mapDict* replacements to every text node of *html_content*.

    Args:
        mapDict: Mapping of text to find -> replacement text, forwarded to
            ``replace_substrings`` for each string node.
        html_content: Markup to rewrite; parsed with ``html.parser``.

    Returns:
        The re-serialized markup: tags are left in place and only string
        nodes are rewritten.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    if not mapDict:
        # Nothing to substitute; still return the re-serialized tree so the
        # output has the same shape as in the non-empty case.
        return str(soup)

    def iterate_html(element):
        if isinstance(element, NavigableString):
            new_string = replace_substrings(element, mapDict)
            if new_string != element:
                # Rebuild with the node's own class so special string nodes
                # (e.g. comments) are not demoted to plain text.
                element.replace_with(type(element)(new_string))
        elif isinstance(element, Tag):
            # Walk a snapshot: replace_with() edits element.contents.
            for child in list(element.contents):
                iterate_html(child)

    iterate_html(soup)
    return str(soup)

def export_to_docx(original_file_path, mapDict, uploadFolder):
    """Write a copy of a document with the *mapDict* replacements applied.

    The original is converted to markup, its text nodes are rewritten, and
    the result is rebuilt as a new document inside *uploadFolder*.

    Args:
        original_file_path: Path of the document to process.
        mapDict: Mapping of text to find -> replacement text.
        uploadFolder: Directory that receives the generated file.

    Returns:
        Tuple ``(path_of_generated_file, mime_type_string)``. The file name
        is the sanitized original base name prefixed with ``modified_``.
    """
    output_name = secure_filename("modified_" + os.path.basename(original_file_path))
    temp_file_path = os.path.join(uploadFolder, output_name)

    html_content = get_html_from_docx(original_file_path)
    modified_html_content = modify_html_content(mapDict, html_content)
    html_to_docx(original_file_path, modified_html_content, temp_file_path)

    # NOTE(review): "application/msword" is the legacy .doc type, while the
    # file is written by python-docx; kept as-is because callers may rely on
    # the exact string - confirm before changing.
    return temp_file_path, "application/msword"

def extract_embedded_images(docx_file):
    """Collect the media files stored inside a DOCX archive.

    Opens *docx_file* as a ZIP archive and returns a dict that maps each
    archive entry name starting with ``word/media/`` to that entry's bytes.
    """
    media_prefix = 'word/media/'
    with ZipFile(docx_file, 'r') as archive:
        return {
            name: archive.read(name)
            for name in archive.namelist()
            if name.startswith(media_prefix)
        }

def add_hyperlink(paragraph, url, text):
    """Append an external hyperlink showing *text* to *paragraph*.

    Args:
        paragraph: python-docx paragraph that receives the link.
        url: Link target, registered as an external relationship on the
            paragraph's part.
        text: Visible text of the link.
    """
    # The relationship type is a plain URI. qn() only resolves "prefix:tag"
    # names against its namespace map, so it must not be applied here.
    hyperlink_reltype = (
        'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
    )
    r_id = paragraph.part.relate_to(url, hyperlink_reltype, is_external=True)

    hyperlink = OxmlElement('w:hyperlink')
    hyperlink.set(qn('r:id'), r_id)

    new_run = OxmlElement('w:r')
    rPr = OxmlElement('w:rPr')

    # Reference the "Hyperlink" character style (renders blue and underlined
    # when the document defines that style).
    rStyle = OxmlElement('w:rStyle')
    rStyle.set(qn('w:val'), 'Hyperlink')
    rPr.append(rStyle)
    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)

    paragraph._element.append(hyperlink)

def add_html_to_docx(html_content, doc, embedded_images):
    """Append the content of *html_content* to the python-docx document *doc*.

    Top-level ``<p>``, ``<h1>``-``<h6>`` and ``<img>`` elements and bare text
    are converted; other top-level elements are skipped. Inside a paragraph,
    ``<b>``, ``<i>``, ``<u>`` and ``<a>`` become formatted runs, ``<img>``
    becomes an inline picture, and any other tag contributes its plain text.

    Args:
        html_content: Markup to convert; parsed with ``html.parser``.
        doc: Document object that receives the paragraphs.
        embedded_images: Dict mapping ``word/media/...`` names to image
            bytes, used to resolve ``embedded:`` image sources.
    """
    # Local import: data: URIs need base64, which the file does not import.
    import base64

    soup = BeautifulSoup(html_content, 'html.parser')

    def load_image(src):
        # Return a byte stream for the image at *src*, or None when an
        # "embedded:" reference has no matching entry.
        if src.startswith('data:image'):
            payload = re.sub(r'^data:image/.+;base64,', '', src)
            return BytesIO(base64.b64decode(payload))
        if src.startswith('embedded:'):
            image_key = 'word/media/' + src.split(':', 1)[1]
            if image_key in embedded_images:
                return BytesIO(embedded_images[image_key])
            return None
        # Remote image: bound the wait and surface HTTP errors instead of
        # handing an error page to the picture decoder.
        response = requests.get(src, timeout=30)
        response.raise_for_status()
        return BytesIO(response.content)

    def add_image(element, paragraph):
        # Read src outside the try block so the error message below can
        # always refer to it.
        img_url = element.get('src')
        try:
            if not img_url:
                raise ValueError("missing src attribute")
            image = load_image(img_url)
            if image is not None:
                # Insert into the current paragraph; doc.add_picture() would
                # append the picture in a new paragraph at the document end.
                paragraph.add_run().add_picture(image)
        except Exception as e:
            # Best effort: a broken image must not abort the conversion.
            print(f"Error adding image ({img_url}): {e}")

    def add_element_to_paragraph(element, paragraph):
        if isinstance(element, str):
            paragraph.add_run(element)
        elif element.name == 'b':
            run = paragraph.add_run(element.get_text())
            run.bold = True
        elif element.name == 'i':
            run = paragraph.add_run(element.get_text())
            run.italic = True
        elif element.name == 'u':
            run = paragraph.add_run(element.get_text())
            run.underline = True
        elif element.name == 'a':
            # Links are rendered as blue underlined text, not real hyperlinks.
            run = paragraph.add_run(element.get_text())
            run.font.color.rgb = RGBColor(0, 0, 255)
            run.font.underline = True
        elif element.name == 'img':
            add_image(element, paragraph)
        else:
            # Unknown inline tag (e.g. <span>): keep its text, not drop it.
            text = element.get_text()
            if text:
                paragraph.add_run(text)

    def add_elements_to_doc(elements, doc):
        for element in elements:
            if isinstance(element, str):
                doc.add_paragraph(element)
            elif element.name == 'p':
                paragraph = doc.add_paragraph()
                for child in element.children:
                    add_element_to_paragraph(child, paragraph)
            elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                level = int(element.name[1])
                doc.add_heading(element.get_text(), level=level)
            elif element.name == 'img':
                add_element_to_paragraph(element, doc.add_paragraph())

    # Markup fragments without a <body> are converted from the parse root.
    root = soup.body if soup.body is not None else soup
    add_elements_to_doc(root.contents, doc)

def html_to_docx(original_file_path, html_content, output_file):
    """Build a new document from *html_content* and save it to *output_file*.

    Media files are read from the archive at *original_file_path* and
    passed along so image references in the markup can be resolved.
    """
    document = Document()
    media_by_name = extract_embedded_images(original_file_path)
    add_html_to_docx(html_content, document, media_by_name)
    document.save(output_file)