diff --git a/.gitignore b/.gitignore index bc4c914..163470e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ /docs/source/api/ .vscode/ __pycache__/ +.venv/ +*.pdf +*.csv diff --git a/README.md b/README.md index feff076..beccf07 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,18 @@ **pdf-importer** is a PDF parser for credit card statements. It accepts statement from the following issuers: - - [Cembra & Cumulus](https://www.cembra.ch/en/cards/cembra-mastercard/) MasterCard - - [SwissCard Cashback](https://www.swisscard.ch/en/private-customers/products) (AMEX / VISA / MasterCard) +- [Cembra & Cumulus](https://www.cembra.ch/en/cards/cembra-mastercard/) MasterCard +- [SwissCard Cashback](https://www.swisscard.ch/en/private-customers/products) (AMEX / VISA / MasterCard) The data can be saved to a CSV file compatible with [Wallet by budgetbakers](https://budgetbakers.com/) import feature. ## Dependencies - - [Python 3.6](https://www.python.org/downloads/release/python-360/) and [pip 10.0](https://pip.pypa.io/en/stable/). - - [camelot-py](https://camelot-py.readthedocs.io/en/master/) and +- [Python 3.6](https://www.python.org/downloads/release/python-360/) and [pip 10.0](https://pip.pypa.io/en/stable/). +- [camelot-py](https://camelot-py.readthedocs.io/en/master/) and [opencv-python](https://github.com/opencv/opencv-python) for PDF parsing. - - [python-dateutil](https://dateutil.readthedocs.io/en/stable/) for date format management. - - [pandas](https://pandas.pydata.org/) for CSV export. +- [python-dateutil](https://dateutil.readthedocs.io/en/stable/) for date format management. +- [pandas](https://pandas.pydata.org/) for CSV export. 
## Installation @@ -30,22 +30,54 @@ using [pip](https://pip.pypa.io/en/stable/): python -m pip install pdf-importer ``` +## Development Setup (Optional) + +For local development with an isolated virtual environment: + +```bash +git clone https://github.com/c-vigo/StatementPDFImporter.git +cd StatementPDFImporter +python3 -m venv .venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate +pip install -r requirements.txt +pip install -e . +``` + +This installs the package in editable mode with all dependencies pinned for reproducible builds. + ## Usage +### Parsing PDF statements + You can parse a PDF statement simply with ``` python -m pdf_importer [filename] [type] [-o csv_file] ``` -where - - *filename* is the full path to the PDF file - - *type* is either *cembra* or *cashback* - - *csv_file* is the full path to the CSV file where the data is saved. +where + +- *filename* is the full path to the PDF file +- *type* is either *cembra* or *cashback* +- *csv_file* is the full path to the CSV file where the data is saved. + +### Cleaning CSV files + +Bank-exported CSV files may contain double-quote characters (`"`) that are not supported by +some import tools. You can remove them with the included CSV cleaner: + +``` +python -m pdf_importer.csv_cleaner [filename] [-o output_file] +``` + +where + +- *filename* is the full path to the CSV file to clean +- *output_file* is the full path to the cleaned output file (optional; defaults to overwriting the input file). ## Authors -* [**Carlos Vigo**](mailto:carviher1990@gmail.com?subject=[GitHub%-%pdf-importer]) - *Initial work* - +- [**Carlos Vigo**](mailto:carviher1990@gmail.com?subject=[GitHub%-%pdf-importer]) - *Initial work* - [GitHub](https://github.com/c-vigo) ## Contributing @@ -55,7 +87,7 @@ conduct, and the process for submitting pull requests to us. ## Versioning -We use [Git](https://git-scm.com/) for versioning. For the versions available, see the +We use [Git](https://git-scm.com/) for versioning. 
For the versions available, see the [tags on this repository](https://gitlab.ethz.ch/exotic-matter/cw-beam/pdf-importer). ## License @@ -64,4 +96,4 @@ This project is licensed under the [GNU GPLv3 License](LICENSE.md) ## Built With -* [PyCharm Professional 2020](https://www.jetbrains.com/pycharm//) - The IDE used +- [PyCharm Professional 2020](https://www.jetbrains.com/pycharm//) - The IDE used diff --git a/pdf_importer/Importers.py b/pdf_importer/Importers.py index 6c07154..f7a5dee 100644 --- a/pdf_importer/Importers.py +++ b/pdf_importer/Importers.py @@ -20,7 +20,7 @@ def extract_cembra(filename): try: date = parse(row[1].strip(), dayfirst=True).date() _ = parse(row[0].strip(), dayfirst=True).date() - text = row[2] + text = row[2].replace("\n", " ") credit = row[3].replace('\'', '') debit = row[4].replace('\'', '') amount = -float(debit) if debit else float(credit) diff --git a/pdf_importer/csv_cleaner.py b/pdf_importer/csv_cleaner.py new file mode 100644 index 0000000..153d06a --- /dev/null +++ b/pdf_importer/csv_cleaner.py @@ -0,0 +1,52 @@ +"""Post-processing script to clean CSV files exported from banking tools. + +Removes all double-quote characters (") from CSV files, as some import tools +do not support them. 
from argparse import SUPPRESS, ArgumentParser


def csv_cleaner(argv=None):
    """Remove all double-quote characters from a CSV file.

    Parses the command line, reads the input file as UTF-8 (a leading
    byte-order mark is dropped by the ``utf-8-sig`` codec), deletes every
    ``"`` character and writes the result as UTF-8 to the output path.
    A one-line summary of the operation is printed to standard output.

    Command line::

        csv_cleaner filename [-o OUTPUT]

    Args:
        argv: Optional list of command-line argument strings. When ``None``
            (the default) ``argparse`` falls back to ``sys.argv[1:]``, which
            is the behaviour of the original zero-argument entry point.

    Returns:
        None.

    Raises:
        SystemExit: Raised by ``argparse`` on invalid arguments or ``-h``.
        OSError: If the input file cannot be read or the output file cannot
            be written.
    """
    ap = ArgumentParser(
        prog='csv_cleaner',
        description='Remove double-quote characters from a CSV file',
    )

    ap.add_argument(
        'filename',
        help='CSV file to clean',
        type=str,
    )

    # Conventional short/long spelling, matching the documented
    # "[-o output_file]" usage.
    ap.add_argument(
        '-o',
        '--output',
        dest='output',
        help='output CSV file (defaults to overwriting the input file)',
        type=str,
        default=None,
    )

    # Earlier revisions registered the option with the dashes swapped
    # ('--o' / '-output'). Keep those spellings working as hidden aliases so
    # existing invocations do not break; they are omitted from --help.
    ap.add_argument(
        '--o',
        '-output',
        dest='output',
        help=SUPPRESS,
        type=str,
        default=None,
    )

    args = ap.parse_args(argv)

    output = args.output if args.output is not None else args.filename

    # newline='' disables newline translation on both read and write, so the
    # only bytes that change are the removed quotes (and a stripped BOM);
    # the file's original line endings are preserved on every platform.
    with open(args.filename, 'r', encoding='utf-8-sig', newline='') as f:
        content = f.read()

    cleaned = content.replace('"', '')

    with open(output, 'w', encoding='utf-8', newline='') as f:
        f.write(cleaned)

    print('Cleaned "{}"{}.'.format(
        args.filename,
        '' if output == args.filename else ' -> "{}"'.format(output),
    ))


if __name__ == '__main__':
    csv_cleaner()