# iDUM/api.py at master · being-invincible/iDUM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from fastapi import FastAPI
import os
from paddleocr import PaddleOCR, draw_ocr
import json
import fitz
import shutil
import boto3

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Converting the dictionary to a dataframe
import pandas as pd

# To eliminate KMP Kernel Error
# os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

# Shared OCR engine, created once at import time (English, with the
# text-angle classifier switched on via use_angle_cls=True).
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# The key is read from the OPENAI_API_KEY environment variable so that it does
# not have to be hard-coded here; the placeholder is kept as the fallback value.
# (The previous code assigned to `openai.api_key`, but `openai` is never
# imported in this module, which raised NameError on import.)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "<your-openai-key>")

# To control the randomness and creativity of the generated text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0, openai_api_key=OPENAI_API_KEY)

# Prompt for the chat model. The `{text}` placeholder is filled with the
# concatenated OCR output when the template is formatted in `upload`.
extraction_template = """
The following text are obtained from an output of top performing OCR model, from a structured or unstructured document in sequential order.
Perform information extraction by understanding certain key information and try to fetch its values (If the values are unknown, fill them as "None")

Format the output in JSON structure and convert all the keys to camel-casing:

text: {text}
"""

"""
1. invoiceNo
2. invoiceDate
3. invoiceTitle
4. gstin
5. itemDetails
6. totalAmount
7. gstAmount
8. grandTotal
9. shipToDetails
10. billToDetails
"""

# FastAPI application object; the route decorators below register handlers on it.
app = FastAPI()

@app.get("/")
def home():
    """Landing route: reply with a fixed status code string and a greeting."""
    payload = dict(status="200", message="Welcome!")
    return payload


def _render_pages(pdf_path, out_dir):
    """Render each page of the PDF at *pdf_path* to a PNG inside *out_dir*.

    Returns the list of image paths in page order.
    """
    image_paths = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        for index in range(doc.page_count):
            page = doc.load_page(index)
            image_path = os.path.join(out_dir, f"page-{page.number}.png")
            page.get_pixmap().save(image_path)
            image_paths.append(image_path)
    return image_paths


def _ocr_page_text(image_path):
    """Run OCR on one image and return its recognised lines joined by spaces."""
    result = ocr.ocr(image_path, cls=True)
    texts = []
    for page_result in result or []:
        # Defensive: an image with no detected text may yield an empty or
        # None entry; treat that as "no lines" instead of crashing.
        for line in page_result or []:
            # Each line is [bbox, (text, score)]; only the text is needed.
            text, _score = line[1]
            texts.append(str(text))
    return ' '.join(texts)


def _parse_llm_json(content):
    """Parse the model reply as JSON, tolerating a surrounding Markdown fence."""
    payload = content.strip()
    if payload.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        payload = payload.split("\n", 1)[1] if "\n" in payload else ""
        payload = payload.rsplit("```", 1)[0]
    return json.loads(payload)


@app.post("/pdf")
def upload(pdf):
    """Extract structured invoice data from a PDF stored in S3.

    The PDF is downloaded from the ``invoice-pdfs-v01`` bucket, every page is
    rendered to an image and passed through OCR in page order, and the
    combined text is sent to the chat model using ``extraction_template``.

    Args:
        pdf: Object key of the PDF inside the S3 bucket.

    Returns:
        The model's reply parsed from JSON.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        Errors raised by the S3 download, PDF rendering or OCR propagate
        unchanged.
    """
    import tempfile

    # Only the last path component names the local copy, so keys such as
    # "folder/invoice.pdf" or ones containing ".." stay inside the work dir.
    local_name = os.path.basename(str(pdf)) or "document.pdf"

    # Creating an S3 access object
    s3_client = boto3.client(
        "s3",
        # aws_access_key_id=ACCESS_KEY,
        # aws_secret_access_key=SECRET_KEY,
        # region=REGION
    )

    # A private scratch directory per request: concurrent requests cannot
    # delete each other's files, and everything is removed on exit even when
    # an exception is raised.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, local_name)
        s3_client.download_file(
            Filename=pdf_path,
            Bucket="invoice-pdfs-v01",
            Key=pdf
        )

        image_dir = os.path.join(workdir, "pages")
        os.makedirs(image_dir, exist_ok=True)

        # OCR the pages in document order so the "Page - N" labels match the
        # real page numbers (1-based).
        sections = []
        image_paths = _render_pages(pdf_path, image_dir)
        for page_no, image_path in enumerate(image_paths, start=1):
            sections.append('Page - {a} \n'.format(a=page_no))
            sections.append(_ocr_page_text(image_path))
            sections.append('\n')

    st = ''.join(sections)

    prompt_template = ChatPromptTemplate.from_template(extraction_template)

    messages = prompt_template.format_messages(text=st)
    response = chat(messages)

    return _parse_llm_json(response.content)