Skip to content

Commit fb498a4

Browse files
authored
Merge pull request #207 from Eyobyb/fix/pypdf2
Replaced pdfplumber with PyPDF2
2 parents 70f3b62 + ef38bc8 commit fb498a4

2 files changed

Lines changed: 12 additions & 6 deletions

File tree

src/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ pinecone-client = "^2.2.2"
1818
beautifulsoup4 = "4.12.2"
1919
markdown = ">=3.4.4,<3.5.0"
2020
loguru = ">=0.7.0,<0.8.0"
21-
pdfplumber = "0.10.3"
21+
pypdf2 = "^3.0.1"
22+
2223

2324

2425
[tool.poetry.group.test.dependencies]

src/sherpa_ai/utils.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
import sherpa_ai.config as cfg
1515
from sherpa_ai.models.sherpa_base_model import SherpaOpenAI
1616

17-
import pdfplumber
18-
17+
import PyPDF2
1918
def load_files(files: List[str]) -> List[Document]:
2019
documents = []
2120
loader = None
@@ -247,7 +246,13 @@ def show_commands_only(logs):
247246

248247
def extract_text_from_pdf(pdf_path):
249248
text = ""
250-
with pdfplumber.open(pdf_path) as pdf:
251-
for page in pdf.pages:
252-
text += page.extract_text()
249+
# Extract text from a PDF using PdfReader
250+
pdf_file = open(pdf_path, "rb")
251+
pdf_reader = PyPDF2.PdfReader(pdf_file)
252+
253+
text = ""
254+
for page in pdf_reader.pages:
255+
text += page.extract_text()
256+
257+
pdf_file.close()
253258
return text

0 commit comments

Comments
 (0)