-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathText_Extract.py
More file actions
24 lines (21 loc) · 864 Bytes
/
Text_Extract.py
File metadata and controls
24 lines (21 loc) · 864 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from PyPDF2 import PdfReader
import config
# Extract text from each page of the PDF
def extract_pdf_text(file_path):
reader = PdfReader(file_path)
text = ''
for page in reader.pages:
text += page.extract_text() # Extract text from page
return text
# Write the extracted text to a file
def write_text_to_file(text, output_file):
with open(output_file, 'w', encoding='utf-8') as file:
file.write(text)
# Extract text from PDF and save to a text file
def extract_and_save_text(input_pdf=None, output_txt=config.EXTRACTED_TEXT_PATH):
input_pdf = input_pdf or config.PDF_INPUT_PATH # Use provided path or default
if input_pdf:
pdf_text = extract_pdf_text(input_pdf) # Extract text from the PDF
write_text_to_file(pdf_text, output_txt) # Save text to file
else:
pass