TaxReturnFasterMaybe/pdf.py at master · HenryQuan/TaxReturnFasterMaybe · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import csv
import os
import re

import pdfplumber

# Maps each keyword to the list of amounts (as strings) collected for it.
KEYWORDS_DICT: dict[str, list[str]] = {}

# add a keywords.txt file to config/ (one keyword per line)
with open(
    os.path.join(os.getcwd(), "config/keywords.txt"), encoding="utf-8"
) as keywords:
    # put # in front of a word to comment it out
    # Blank lines are skipped: an empty keyword would match every entry,
    # because `"" in text` is always True. splitlines() also drops the "\r"
    # that split("\n") would leave on keywords from a CRLF file.
    KEYWORDS = [
        word
        for word in keywords.read().splitlines()
        if word.strip() and not word.startswith("#")
    ]


# find numbers like 1.0, 2,312.00, 100.00, up to a trillion
# (the leading \s means a number only counts when preceded by whitespace)
_NUMBER_REGEX = re.compile(r"\s(\d*,*\d*,*\d*,*\d*\.\d+)")
# 01 Jan, 20 Aug, 31 Dec
_DATE_REGEX = re.compile(r"(\d{2}\s[A-Z][a-z]{2})")


def _group_lines_by_date(lines: list[str]) -> list[list[str]]:
    """Split lines into groups that each start at a line containing a date."""
    groups = []
    start_index = None
    for index, line in enumerate(lines):
        if _DATE_REGEX.search(line):
            if start_index is not None:
                groups.append(lines[start_index:index])
            start_index = index
    # Flush the final entry too; nothing after it would trigger the flush.
    if start_index is not None:
        groups.append(lines[start_index:])
    return groups


def _find_amount(text: str) -> "str | None":
    """Return the second-to-last number in text, or the only one, or None."""
    # Slicing never raises: [-2:] yields 0, 1 or 2 items, so the first item is
    # the second-to-last number when there are at least two.
    numbers = _NUMBER_REGEX.findall(text)[-2:]
    return numbers[0] if numbers else None


def pdf2txt(pdf_path: str) -> None:
    """Dump the dated entries of a PDF to text and collect keyword amounts.

    Reads ``documents/<pdf_path>`` (relative to the current working
    directory), concatenates the text lines of every page, and groups them
    into entries: each entry starts at a line containing a date such as
    ``01 Jan`` and runs until the next such line. Lines before the first date
    are ignored. Each entry is written as one ``|``-joined line to
    ``processed/<pdf_path>.txt``.

    For every keyword in ``KEYWORDS`` that occurs in an entry, the amount is
    looked up first in the entry's first line containing the keyword, then in
    the whole entry. The amount found is appended to
    ``KEYWORDS_DICT[keyword]``; if none is found, a message is printed.

    Args:
        pdf_path: File name of the PDF inside the ``documents`` folder.
    """
    input_path = os.path.join(os.getcwd(), f"documents/{pdf_path}")
    output_path = os.path.join(os.getcwd(), f"processed/{pdf_path}.txt")

    merged_line = []
    with pdfplumber.open(input_path) as pdf:
        for page in pdf.pages:
            # Guard against extract_text() returning None for a page without
            # text (reported for some pdfplumber versions).
            pdf_text = page.extract_text() or ""
            merged_line.extend(pdf_text.split("\n"))

    # Create the output folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as output:
        for group in _group_lines_by_date(merged_line):
            full_line = "|".join(group)
            output.write(full_line)
            output.write("\n")

            for keyword in KEYWORDS:
                if keyword not in full_line:
                    continue
                # An entry can be extremely long, so look at the single line
                # mentioning the keyword first. Fall back to the whole entry
                # when the keyword only matches across the "|" joiner.
                line_of_focus = next(
                    (line for line in group if keyword in line), full_line
                )
                amount = _find_amount(line_of_focus)
                if amount is None:
                    # try to find it in the full_line
                    amount = _find_amount(full_line)
                if amount is None:
                    print(f"The number is not found for {keyword} in {full_line}")
                    continue
                KEYWORDS_DICT.setdefault(keyword, []).append(amount)


def pdfs2txt(pdf_paths: list[str]):
    """Run pdf2txt on every file name in pdf_paths, in the given order."""
    for document_name in pdf_paths:
        pdf2txt(document_name)


def documentsInFolder(folder: str) -> list[str]:
    """Return the sorted names of the non-hidden entries in folder.

    Args:
        folder: Directory to list, resolved against the current working
            directory (an absolute path is used as is).

    Returns:
        Entry names that do not start with ".", in sorted order.
    """
    entries = os.listdir(os.path.join(os.getcwd(), folder))
    # os.listdir returns names in arbitrary order; sorting keeps the
    # processing order, and so the order of collected results, reproducible.
    return sorted(entry for entry in entries if not entry.startswith("."))


def print_results(dictionary: dict[str, list[str]]):
    """Print each keyword followed by its collected values, one per line."""
    for keyword, amounts in dictionary.items():
        # The keyword acts as a heading line above its own values.
        for text in [keyword, *amounts]:
            print(text)


def write_results_csv(dictionary: dict[str, list[str]]) -> None:
    """Write one CSV row per keyword to processed/results.csv.

    Each row holds the keyword followed by its values. Fields go through
    the csv module, so an amount with thousands separators such as
    "2,312.00" is quoted and stays in a single column instead of being
    split at its commas. Rows no longer end with a trailing comma.

    Args:
        dictionary: Maps each keyword to the list of values to write.
    """
    # Create the output folder on first use instead of failing in open().
    os.makedirs("processed", exist_ok=True)
    # newline="" hands line-ending control to the csv module.
    with open(
        "processed/results.csv", "w", newline="", encoding="utf-8"
    ) as results:
        writer = csv.writer(results)
        for key, values in dictionary.items():
            writer.writerow([key, *values])


if __name__ == "__main__":
    # Process everything visible in documents/, then report the collected
    # values: first on stdout, then as a CSV file.
    source_documents = documentsInFolder("documents")
    pdfs2txt(source_documents)
    for report in (print_results, write_results_csv):
        report(KEYWORDS_DICT)