From 91f502d4342c71518f9834b0a4a2d0eb8bd223dc Mon Sep 17 00:00:00 2001 From: Sven Scheffel Date: Tue, 3 Feb 2026 09:57:20 +0100 Subject: [PATCH 1/3] Add requirements.txt and document venv development setup - Added requirements.txt with version-pinned dependencies for reproducible installs - Added 'Development Setup (Optional)' section to README.md for venv-based local development - Enables reproducible development environments across different machines - Follows Python packaging best practices for development workflows --- README.md | 42 +++++++++++++++++++++++++++++------------- requirements.txt | 9 +++++++++ 2 files changed, 38 insertions(+), 13 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index feff076..2883280 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,18 @@ **pdf-importer** is a PDF parser for credit card statements. It accepts statement from the following issuers: - - [Cembra & Cumulus](https://www.cembra.ch/en/cards/cembra-mastercard/) MasterCard - - [SwissCard Cashback](https://www.swisscard.ch/en/private-customers/products) (AMEX / VISA / MasterCard) +- [Cembra & Cumulus](https://www.cembra.ch/en/cards/cembra-mastercard/) MasterCard +- [SwissCard Cashback](https://www.swisscard.ch/en/private-customers/products) (AMEX / VISA / MasterCard) The data can be saved to a CSV file compatible with [Wallet by budgetbakers](https://budgetbakers.com/) import feature. ## Dependencies - - [Python 3.6](https://www.python.org/downloads/release/python-360/) and [pip 10.0](https://pip.pypa.io/en/stable/). - - [camelot-py](https://camelot-py.readthedocs.io/en/master/) and +- [Python 3.6](https://www.python.org/downloads/release/python-360/) and [pip 10.0](https://pip.pypa.io/en/stable/). +- [camelot-py](https://camelot-py.readthedocs.io/en/master/) and [opencv-python](https://github.com/opencv/opencv-python) for PDF parsing. - - [python-dateutil](https://dateutil.readthedocs.io/en/stable/) for date format management. 
- - [pandas](https://pandas.pydata.org/) for CSV export. +- [python-dateutil](https://dateutil.readthedocs.io/en/stable/) for date format management. +- [pandas](https://pandas.pydata.org/) for CSV export. ## Installation @@ -30,6 +30,21 @@ using [pip](https://pip.pypa.io/en/stable/): python -m pip install pdf-importer ``` +## Development Setup (Optional) + +For local development with an isolated virtual environment: + +```bash +git clone https://github.com/c-vigo/StatementPDFImporter.git +cd StatementPDFImporter +python3 -m venv .venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate +pip install -r requirements.txt +pip install -e . +``` + +This installs the package in editable mode along with its dependencies; `requirements.txt` sets minimum versions (`>=`), not exact pins. + ## Usage You can parse a PDF statement simply with @@ -37,15 +52,16 @@ You can parse a PDF statement simply with ``` python -m pdf_importer [filename] [type] [-o csv_file] ``` -where - - *filename* is the full path to the PDF file - - *type* is either *cembra* or *cashback* - - *csv_file* is the full path to the CSV file where the data is saved. +where + +- *filename* is the full path to the PDF file +- *type* is either *cembra* or *cashback* +- *csv_file* is the full path to the CSV file where the data is saved. ## Authors -* [**Carlos Vigo**](mailto:carviher1990@gmail.com?subject=[GitHub%-%pdf-importer]) - *Initial work* - +- [**Carlos Vigo**](mailto:carviher1990@gmail.com?subject=[GitHub%-%pdf-importer]) - *Initial work* - [GitHub](https://github.com/c-vigo) ## Contributing @@ -55,7 +71,7 @@ conduct, and the process for submitting pull requests to us. ## Versioning -We use [Git](https://git-scm.com/) for versioning. For the versions available, see the +We use [Git](https://git-scm.com/) for versioning. For the versions available, see the [tags on this repository](https://gitlab.ethz.ch/exotic-matter/cw-beam/pdf-importer). 
## License @@ -64,4 +80,4 @@ This project is licensed under the [GNU GPLv3 License](LICENSE.md) ## Built With -* [PyCharm Professional 2020](https://www.jetbrains.com/pycharm//) - The IDE used +- [PyCharm Professional 2020](https://www.jetbrains.com/pycharm//) - The IDE used diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..68b9565 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +# PDF parsing +camelot-py[base]>=0.11.0 +opencv-python>=4.5.0 + +# Data handling +pandas>=1.3.0 + +# Date parsing +python-dateutil>=2.8.0 From 2aad8d64711a4b92bf0e829f7fa6558ea13108d1 Mon Sep 17 00:00:00 2001 From: Sven Scheffel Date: Mon, 2 Mar 2026 09:58:52 +0100 Subject: [PATCH 2/3] Add CSV cleaner script and update README with usage sections Add a post-processing script (csv_cleaner.py) that strips double-quote characters from bank-exported CSV files for compatibility with import tools. Update README to document both PDF parsing and CSV cleaning use cases. Add *.pdf and *.csv to .gitignore. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 3 +++ README.md | 16 ++++++++++++ pdf_importer/csv_cleaner.py | 52 +++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 pdf_importer/csv_cleaner.py diff --git a/.gitignore b/.gitignore index bc4c914..163470e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ /docs/source/api/ .vscode/ __pycache__/ +.venv/ +*.pdf +*.csv diff --git a/README.md b/README.md index 2883280..beccf07 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,8 @@ This installs the package in editable mode with all dependencies pinned for repr ## Usage +### Parsing PDF statements + You can parse a PDF statement simply with ``` @@ -59,6 +61,20 @@ where - *type* is either *cembra* or *cashback* - *csv_file* is the full path to the CSV file where the data is saved. +### Cleaning CSV files + +Bank-exported CSV files may contain double-quote characters (`"`) that are not supported by +some import tools. 
"""Post-processing script to clean CSV files exported from banking tools.

Removes all double-quote characters (") from CSV files, as some import tools
do not support them.
"""

from argparse import SUPPRESS, ArgumentParser


def csv_cleaner(argv=None):
    """Remove all double-quote characters from a CSV file.

    The input is read as UTF-8 (a leading byte-order mark, if present, is
    dropped), every ``"`` character is removed, and the result is written as
    UTF-8 either to the file given with ``-o``/``--output`` or, when no output
    is given, back over the input file.

    :param argv: command-line arguments to parse, without the program name.
        ``None`` (the default) makes argparse read ``sys.argv[1:]``.
    :raises SystemExit: raised by argparse on invalid arguments or ``--help``.
    :raises OSError: if the input cannot be read or the output cannot be
        written.
    """

    ap = ArgumentParser(
        prog='csv_cleaner',
        description='Remove double-quote characters from a CSV file',
    )

    ap.add_argument(
        'filename',
        help='CSV file to clean',
        type=str,
    )

    # Conventional short/long spelling, matching the documented usage
    # ``csv_cleaner [filename] [-o output_file]``.
    ap.add_argument(
        '-o',
        '--output',
        dest='output',
        help='output CSV file (defaults to overwriting the input file)',
        type=str,
        default=None,
    )

    # Earlier revisions registered the flag as ``--o`` / ``-output``. Keep
    # those spellings working for existing callers, but hide them from --help.
    ap.add_argument(
        '--o',
        '-output',
        dest='output',
        help=SUPPRESS,
        type=str,
    )

    args = ap.parse_args(argv)

    output = args.output if args.output is not None else args.filename

    # newline='' disables newline translation on both sides so the file's
    # original line endings (e.g. CRLF from a bank export) survive untouched;
    # only the quote characters are removed.
    with open(args.filename, 'r', encoding='utf-8-sig', newline='') as f:
        content = f.read()

    cleaned = content.replace('"', '')

    with open(output, 'w', encoding='utf-8', newline='') as f:
        f.write(cleaned)

    print('Cleaned "{}"{}.'.format(
        args.filename,
        '' if output == args.filename else ' -> "{}"'.format(output),
    ))


if __name__ == '__main__':
    csv_cleaner()
producing CSV records that spanned multiple lines (e.g. foreign currency rows). Replace embedded newlines with spaces, matching the existing behaviour in extract_cashback. Co-Authored-By: Claude Opus 4.8 (1M context) --- pdf_importer/Importers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdf_importer/Importers.py b/pdf_importer/Importers.py index 6c07154..f7a5dee 100644 --- a/pdf_importer/Importers.py +++ b/pdf_importer/Importers.py @@ -20,7 +20,7 @@ def extract_cembra(filename): try: date = parse(row[1].strip(), dayfirst=True).date() _ = parse(row[0].strip(), dayfirst=True).date() - text = row[2] + text = row[2].replace("\n", " ") credit = row[3].replace('\'', '') debit = row[4].replace('\'', '') amount = -float(debit) if debit else float(credit)