-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathapi.py
More file actions
132 lines (100 loc) · 3.55 KB
/
api.py
File metadata and controls
132 lines (100 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from fastapi import FastAPI
import os
from paddleocr import PaddleOCR, draw_ocr
import json
import fitz
import shutil
import boto3
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
# Converting the dictionary to a dataframe
import pandas as pd
# To eliminate KMP Kernel Error (left disabled; uncomment if the error appears)
# os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

# Single OCR engine shared by every request; built once at import time.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# The key is kept in a plain constant: the `openai` package is not imported by
# this module, so assigning to `openai.api_key` raised NameError on import.
# The OPENAI_API_KEY environment variable takes precedence; the placeholder is
# only a fallback and must be replaced for real use.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "<your-openai-key>")

# To control the randomness and creativity of the generated text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0, openai_api_key=OPENAI_API_KEY)

# Prompt sent to the chat model; `{text}` is filled with the OCR output.
extraction_template = """
The following text are obtained from an output of top performing OCR model, from a structured or unstructured document in sequential order.
Perform information extraction by understanding certain key information and try to fetch its values (If the values are unknown, fill them as "None")
Format the output in JSON structure and convert all the keys to camel-casing:
text: {text}
"""

# Candidate keys for the extraction. This list is NOT part of the prompt above;
# it is kept here for reference only.
#  1. invoiceNo
#  2. invoiceDate
#  3. invoiceTitle
#  4. gstin
#  5. itemDetails
#  6. totalAmount
#  7. gstAmount
#  8. grandTotal
#  9. shipToDetails
# 10. billToDetails

app = FastAPI()
@app.get("/")
def home():
    """Return the static welcome payload served at the root path."""
    payload = {
        "status": "200",
        "message": "Welcome!",
    }
    return payload
def _ocr_image_text(image_path):
    """Run OCR on one image file and return its text joined by single spaces."""
    texts = []
    # The result holds one entry per image; an entry can be empty/None when no
    # text was detected, so such entries are skipped instead of iterated.
    for image_result in ocr.ocr(image_path, cls=True) or []:
        if not image_result:
            continue
        for _bbox, (text, _score) in image_result:
            texts.append(str(text))
    return ' '.join(texts)


def _parse_llm_json(content):
    """Decode the model reply as JSON, tolerating a surrounding Markdown fence."""
    payload = content.strip()
    if payload.startswith("```"):
        payload = payload.strip("`").strip()
        # Drop the optional language tag of a ```json fence.
        if payload[:4].lower() == "json":
            payload = payload[4:]
    return json.loads(payload)


@app.post("/pdf")
def upload(pdf: str):
    """Download a PDF from S3, OCR every page and extract fields with the LLM.

    Args:
        pdf: Key of the PDF object inside the ``invoice-pdfs-v01`` bucket.

    Returns:
        The JSON document produced by the chat model, decoded into Python
        objects.

    Raises:
        ValueError: If ``pdf`` does not contain a usable file name.
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    # Only the final path component is used locally, so keys with a prefix
    # ("a/b.pdf") or with "../" cannot escape the working folders.
    local_name = os.path.basename(pdf)
    stem = os.path.splitext(local_name)[0]
    if not stem:
        raise ValueError(f"Invalid PDF key: {pdf!r}")

    # NOTE(review): the download folder is absolute ("/pdfFolder") while the
    # image folder is relative, exactly as before; downloaded PDFs are kept.
    pdf_dir = '/pdfFolder'
    os.makedirs(pdf_dir, exist_ok=True)
    pdf_path = os.path.join(pdf_dir, local_name)

    # Creating an S3 access object (credentials come from boto3's defaults).
    s3_client = boto3.client("s3")
    # Downloading the PDF from the S3 bucket to the local folder.
    s3_client.download_file(
        Filename=pdf_path,
        Bucket="invoice-pdfs-v01",
        Key=pdf,
    )

    image_dir = os.path.join('imgFolder', stem)
    os.makedirs(image_dir, exist_ok=True)
    page_sections = []
    try:
        # The context manager closes the document even if rendering/OCR fails.
        with fitz.open(pdf_path) as doc:
            # Render and OCR the pages in document order so the "Page - N"
            # labels always match the real page numbers.
            for index in range(doc.page_count):
                page = doc.load_page(index)
                image_path = os.path.join(image_dir, f"page-{page.number}.png")
                page.get_pixmap().save(image_path)
                page_text = _ocr_image_text(image_path)
                page_sections.append(f'Page - {page.number + 1} \n{page_text}\n')
    finally:
        # Remove only this PDF's images, and do so on failure as well.
        shutil.rmtree(image_dir, ignore_errors=True)

    prompt_template = ChatPromptTemplate.from_template(extraction_template)
    messages = prompt_template.format_messages(text=''.join(page_sections))
    response = chat(messages)
    return _parse_llm_json(response.content)