-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_parser.py
More file actions
28 lines (23 loc) · 814 Bytes
/
pdf_parser.py
File metadata and controls
28 lines (23 loc) · 814 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import requests
from io import BytesIO
import pypdf
def extract_pdf_txt(url: str, timeout: float = 30.0) -> str:
    """
    Download a PDF from a URL and return its text, extracted with pypdf.

    Args:
        url (str): The URL of the PDF file.
        timeout (float): Seconds to wait on the server before giving up;
            passed straight to ``requests.get``. Defaults to 30.

    Returns:
        str: The text of every page, concatenated in page order. If the
        download fails (any ``requests.exceptions.RequestException``,
        including a timeout or an HTTP error status), the literal message
        "Error downloading PDF in pdf_parser.py." is returned instead.

    Note:
        Exceptions raised by pypdf while parsing the downloaded bytes are
        not caught here and propagate to the caller.
    """
    # Only the network steps live inside the try block, so the handler below
    # cannot be mistaken for one that also covers PDF parsing.
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into an exception
        payload = response.content
    except requests.exceptions.RequestException:
        return "Error downloading PDF in pdf_parser.py."

    # Keep the in-memory stream open while pages are read from the reader.
    with BytesIO(payload) as pdf_bytes:
        pdf_reader = pypdf.PdfReader(pdf_bytes)
        # join() avoids repeated string reallocation; `or ""` guards against
        # a falsy result from extract_text() for a page.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)