-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtocExtractor.py
More file actions
131 lines (116 loc) · 5.64 KB
/
tocExtractor.py
File metadata and controls
131 lines (116 loc) · 5.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os.path
import pandas as pd
import PyPDF2
from bs4 import BeautifulSoup
import re
import os
import fitz
from pathlib import Path
from thefuzz import fuzz, process
import argparse
import sys
class tocExtractor():
    """Find the Table of Contents page of a PDF and turn it into an outline.

    The outline is a list of ``[title, page_number]`` pairs built from the
    positioned text lines of the first page whose text fuzzily matches the
    supplied keywords.
    """

    # Only the first 11 pages (indices 0..10) are searched for the ToC.
    _MAX_TOC_SEARCH_PAGES = 11
    # Fuzzy-match score a keyword must exceed to count as present on a page.
    _KEYWORD_SCORE = 90

    # Page ranges such as "4-8" / "4 to 8"; group 1 is the first page.
    _RANGE_DASH = re.compile(r'([0-9]{1,5})[\s]*-[\s]*[0-9\s]{1,5}')
    _RANGE_TO = re.compile(r'([0-9]{1,5})[\s]*to[\s]*[0-9\s]{1,5}')
    _LATIN_LETTER = re.compile(r'[a-zA-Z]')
    _LATIN_RUN = re.compile(r'[a-zA-Z\s]+')
    _CJK_RUN = re.compile(r'[\u4e00-\u9fff]+')
    # Leading numeric part of a CSS length such as "72.5pt".
    _CSS_NUMBER = r'(-?\d+(?:\.\d+)?)'

    @staticmethod
    def _page_text(page, kind):
        """Return the page content in the requested format ('text' or 'html').

        Uses ``get_text`` when the page object has it and falls back to the
        older ``getText`` spelling otherwise, so both PyMuPDF naming schemes
        work.
        """
        getter = getattr(page, 'get_text', None) or page.getText
        return getter(kind)

    def find_toc(self, fdir, common_keyword, toc_keyword_1, toc_keyword_2):
        """Search the first pages of a PDF for the ToC and return its outline.

        A page qualifies when ``common_keyword`` AND at least one of the two
        ToC title keywords fuzzily match its text (score above 90).

        Args:
            fdir: Path of the PDF file to open.
            common_keyword: Keyword expected somewhere on the ToC page.
            toc_keyword_1: First candidate for the ToC title.
            toc_keyword_2: Second candidate for the ToC title.

        Returns:
            The result of ``extract_toc`` for the first qualifying page, or
            ``None`` when no page among the searched ones qualifies.
        """
        doc = fitz.open(fdir)
        try:
            common = common_keyword.lower()
            title_keywords = (toc_keyword_1.lower(), toc_keyword_2.lower())
            page_limit = min(len(doc), self._MAX_TOC_SEARCH_PAGES)
            for page_index in range(page_limit):
                page = doc[page_index]
                page_text = self._page_text(page, 'text')
                # Nearly empty pages (covers, blanks) cannot be a ToC.
                if len(page_text) < 15:
                    continue
                text = page_text.strip().lower()
                if fuzz.partial_ratio(common, text) <= self._KEYWORD_SCORE:
                    continue
                if any(fuzz.partial_ratio(keyword, text) > self._KEYWORD_SCORE
                       for keyword in title_keywords):
                    # The HTML rendering is only produced for the matching page.
                    page_html = self._page_text(page, 'html')
                    soup = BeautifulSoup(page_html, 'html.parser')
                    return self.extract_toc(soup)
            return None
        finally:
            # Release the file handle even when parsing raises.
            doc.close()

    def extract_toc(self, soup):
        """Group the positioned paragraphs of a page into ToC lines.

        Every ``<p>`` is expected to carry a ``style`` attribute holding
        ``top``/``left`` declarations and a ``<span>`` holding its text.
        Paragraphs sharing the same ``top`` value form one visual line
        (title plus page number).

        Args:
            soup: Parsed HTML of a single page.

        Returns:
            The list produced by ``process_outline`` (possibly empty).
        """
        rows = []
        for paragraph in soup.find_all('p'):
            span = paragraph.find('span')
            style = paragraph.get('style')
            if span is None or not style:
                continue
            row = {}
            for declaration in style.split(';'):
                # partition keeps values that themselves contain ':' intact
                # and tolerates empty declarations (e.g. a trailing ';').
                name, separator, value = declaration.partition(':')
                if separator:
                    row[name.strip()] = value.strip()
            # Stored last so the text can never be clobbered by a style key.
            row['text'] = span.text
            rows.append(row)

        if not rows:
            return self.process_outline([])
        frame = pd.DataFrame(rows)
        if 'top' not in frame.columns or 'left' not in frame.columns:
            return self.process_outline([])

        # Order lines by the numeric position; sorting the raw strings would
        # put "100pt" before "72pt".
        frame['_top_value'] = pd.to_numeric(
            frame['top'].str.extract(self._CSS_NUMBER, expand=False),
            errors='coerce')
        frame = frame.dropna(subset=['_top_value'])
        lines = list(frame.groupby('_top_value', sort=True))

        outlines_list = []
        prev_left_positions = []
        last_index = len(lines) - 1
        for index, (_, line) in enumerate(lines):
            # A lone paragraph has no title/page-number pair.
            if len(line) == 1:
                continue
            # Drop the final line when its columns do not line up with the
            # preceding ToC line.
            if index == last_index and not line['left'].isin(prev_left_positions).all():
                break
            outlines_list.append(line['text'].tolist())
            prev_left_positions = line['left'].tolist()
        return self.process_outline(outlines_list)

    def process_outline(self, raw_outline_list):
        """Convert raw ToC lines into ``[title, page_number]`` pairs.

        Lines without a page number or without any title text are skipped.
        When a line has both English and Chinese text, the title is the two
        joined by a single space.

        Args:
            raw_outline_list: List of lines, each a list of text fragments.

        Returns:
            List of ``[title, page_number + 1]`` entries.
        """
        new_outline_list = []
        for outline in raw_outline_list:
            page_num = self.find_integer(outline)
            if page_num is None:
                continue
            title_parts = [
                text
                for text in (self.find_eng(outline), self.find_chi(outline))
                if text
            ]
            if not title_parts:
                continue
            # NOTE(review): the +1 offset is carried over unchanged; confirm
            # what page-numbering convention callers expect.
            new_outline_list.append([" ".join(title_parts), page_num + 1])
        return new_outline_list

    def find_integer(self, input_list):
        """Return the page number found in a list of text fragments.

        A fragment that is a plain integer wins; otherwise the first page of
        a range written as ``4-8`` or ``4 to 8`` is used.

        Returns:
            The page number as ``int``, or ``None`` if none is found.
        """
        for element in input_list:
            try:
                return int(element.strip())
            except (ValueError, AttributeError):
                continue
        for element in input_list:
            if not isinstance(element, str):
                continue
            for pattern in (self._RANGE_DASH, self._RANGE_TO):
                match = pattern.search(element)
                if match:
                    return int(match.group(1))
        return None

    def find_eng(self, input_list):
        """Return the Latin letters and spaces of the first fragment containing a letter.

        Fragments made only of whitespace, digits or other scripts are
        skipped. Returns ``None`` when no fragment contains a Latin letter.
        """
        for element in input_list:
            if self._LATIN_LETTER.search(element):
                return "".join(self._LATIN_RUN.findall(element)).strip()
        return None

    def find_chi(self, input_list):
        """Return the CJK characters of the first fragment containing any.

        Returns ``None`` when no fragment contains a character in the
        U+4E00..U+9FFF range.
        """
        for element in input_list:
            if self._CJK_RUN.search(element):
                return "".join(self._CJK_RUN.findall(element)).strip()
        return None

    def pdf_extract_page(self, pdfPath, pageNum, tocPath):
        """Write a single page of a PDF to ``<tocPath>/<stem>_page_<pageNum>.pdf``.

        Args:
            pdfPath: Path of the source PDF.
            pageNum: Index of the page to copy.
            tocPath: Output directory; created (with parents) if missing.
        """
        os.makedirs(tocPath, exist_ok=True)
        # Prefer the current PyPDF2 class names and fall back to the legacy ones.
        reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader
        writer_cls = getattr(PyPDF2, 'PdfWriter', None) or PyPDF2.PdfFileWriter
        output = os.path.join(tocPath, f'{Path(pdfPath).stem}_page_{pageNum}.pdf')
        with open(pdfPath, 'rb') as read_stream:
            pdf_reader = reader_cls(read_stream)
            pdf_writer = writer_cls()
            add_page = getattr(pdf_writer, 'add_page', None) or pdf_writer.addPage
            add_page(pdf_reader.pages[pageNum])
            # The source stream must stay open while the page is written.
            with open(output, 'wb') as out:
                pdf_writer.write(out)
if __name__ == "__main__":
    # Command-line entry point: parse the four required options, run the
    # ToC search on the given file and print whatever outline comes back.
    parser = argparse.ArgumentParser(
        description=(
            "This script extracts Table of Contents from PDF files using keywords."
            " Tested on Hong Kong Stock Exchange (HKEX) reports."
        )
    )

    # (long flag, short flag, help text) for every required option, in the
    # order they should appear in the usage message.
    option_table = (
        ("--fileDir", "-f", 'Full file location'),
        ("--common_keyword", "-c",
         'Common keyword found in the ToC section of '
         'reports. Example: "Management Discussion and Analysis"'),
        ("--toc_keyword_1", "-k1",
         'Keyword of the ToC title. Example: "Table of '
         'Contents" or "Contents"'),
        ("--toc_keyword_2", "-k2",
         'Keyword of the ToC title different to -k1. '
         'Example: "Table of Contents" or "Contents"'),
    )
    for long_flag, short_flag, help_text in option_table:
        parser.add_argument(long_flag, short_flag, required=True, help=help_text)

    args = parser.parse_args(sys.argv[1:])

    extractor = tocExtractor()
    pdf_outline = extractor.find_toc(
        args.fileDir,
        args.common_keyword,
        args.toc_keyword_1,
        args.toc_keyword_2,
    )
    print(pdf_outline)