From 04e024f87dde09be81e0c42d8b79745570d26f23 Mon Sep 17 00:00:00 2001 From: ChillarAnand Date: Tue, 28 May 2024 19:31:55 +0530 Subject: [PATCH 1/2] [FEATURE] Add support for file_encoding in sources --- ingen/data_source/file_source.py | 4 +++- ingen/reader/file_reader.py | 18 +++++++++--------- ingen/reader/xml_file_reader.py | 2 +- test/data_source/test_file_source.py | 3 +-- test/input/test_utf_16.xml | Bin 0 -> 486 bytes test/metadata/test_metadata.py | 16 +++++++++++++++- test/reader/test_xml_file_reader.py | 23 +++++++++++++++++++++++ 7 files changed, 52 insertions(+), 14 deletions(-) create mode 100644 test/input/test_utf_16.xml diff --git a/ingen/data_source/file_source.py b/ingen/data_source/file_source.py index f25841ec..9d1bf9e7 100644 --- a/ingen/data_source/file_source.py +++ b/ingen/data_source/file_source.py @@ -46,7 +46,7 @@ def fetch_data(self, reader): """ returns a DataFrame of data fetched from input FileSource. """ - return reader.read(self._src) + return reader.read(self._src, encoding=self._src['file_encoding']) def fetch_validations(self): """ @@ -64,4 +64,6 @@ def format_file_path(self, source, params_map): path_parser = PathParser(run_date) if 'file_path' in source: source['file_path'] = path_parser.parse(source['file_path']) + if not source.get('file_encoding'): + source['file_encoding'] = 'utf-8' return source diff --git a/ingen/reader/file_reader.py b/ingen/reader/file_reader.py index df8524b7..3f786565 100644 --- a/ingen/reader/file_reader.py +++ b/ingen/reader/file_reader.py @@ -101,8 +101,8 @@ def read(self, src): def get_config(src): - header_size = src.get('skip_header_size' , 0) - trailer_size = src.get('skip_trailer_size' , 0) + header_size = src.get('skip_header_size', 0) + trailer_size = src.get('skip_trailer_size', 0) all_cols = src.get('columns') return { "header_size": header_size, @@ -115,13 +115,13 @@ class ReaderFactory: @classmethod def get_reader(cls, src): - factory_types = {'delimited_file': CSVFileReader, - 'excel': ExcelFileReader, - 'xml': XMLFileReader, - 'json': JSONFileReader, - "fixed_width": FixedWidthFileReader - } + factory_types = { + 'delimited_file': CSVFileReader, + 'excel': ExcelFileReader, + 'xml': XMLFileReader, + 'json': JSONFileReader, + 'fixed_width': FixedWidthFileReader + } reader_cls = factory_types.get(src.get('file_type')) if reader_cls: return reader_cls() - diff --git a/ingen/reader/xml_file_reader.py b/ingen/reader/xml_file_reader.py index b3869ecf..99536ec6 100644 --- a/ingen/reader/xml_file_reader.py +++ b/ingen/reader/xml_file_reader.py @@ -13,7 +13,7 @@ class XMLFileReader: def read(self, src): - xml_file = open(src['file_path'], 'r') + xml_file = open(src['file_path'], 'r', encoding=src['file_encoding']) try: data = xmltodict.parse(xml_file.read()) tree = et.parse(src['file_path']) diff --git a/test/data_source/test_file_source.py b/test/data_source/test_file_source.py index c5aaa246..b37045bf 100644 --- a/test/data_source/test_file_source.py +++ b/test/data_source/test_file_source.py @@ -20,8 +20,7 @@ def setUp(self): 'file_path': 'test', 'skip_header_size': 1, 'skip_trailer_size': 1, - 'columns': ['col1', 'col2'] - + 'columns': ['col1', 'col2'], } self.params_map = {'query_params': {'table_name': 'positions'}, 'infile': {}} self.source = FileSource(self._src, self.params_map) diff --git a/test/input/test_utf_16.xml b/test/input/test_utf_16.xml new file mode 100644 index 0000000000000000000000000000000000000000..c65dea7d17bc6ef8684a4d5ff52076a0bac61876 GIT binary patch literal 486 zcma)(&1%9>5QWdW;5(#QRnS-?bXAo!MsOi1iCwrVqGF|ODk@?(KDzqdo2X>x;%5HN zoH;W&p7g4&_WIS29PKpMS{p6(r%Y|RYRc0y)}fMVN`6K))52CiG*Vx6P%mJ*>Zqaz z7$3>it?$rz;DvOq^bh-z-Y;#*J;7JS`b_ZE!h1u-gzUEx@W$524(w#F#20`xijynf z#zu40n;pz1Gqf8$MuX9ltGg)dDMr;uW4$W^5yREPOOz6& Date: Sun, 1 Sep 2024 17:25:40 +0530 Subject: [PATCH 2/2] Code cleanup --- ingen/data_source/file_source.py | 6 +++--- ingen/reader/file_reader.py | 17 ++++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/ingen/data_source/file_source.py b/ingen/data_source/file_source.py index 9d1bf9e7..0e8c9e29 100644 --- a/ingen/data_source/file_source.py +++ b/ingen/data_source/file_source.py @@ -31,6 +31,8 @@ def __init__(self, source, params_map): self._src = source else: self._src = self.format_file_path(source, params_map) + if not source.get('file_encoding'): + source['file_encoding'] = 'utf-8' def fetch(self): """ @@ -46,7 +48,7 @@ def fetch_data(self, reader): """ returns a DataFrame of data fetched from input FileSource. """ - return reader.read(self._src, encoding=self._src['file_encoding']) + return reader.read(self._src) def fetch_validations(self): """ @@ -64,6 +66,4 @@ def format_file_path(self, source, params_map): path_parser = PathParser(run_date) if 'file_path' in source: source['file_path'] = path_parser.parse(source['file_path']) - if not source.get('file_encoding'): - source['file_encoding'] = 'utf-8' return source diff --git a/ingen/reader/file_reader.py b/ingen/reader/file_reader.py index 3f786565..f3a9e79c 100644 --- a/ingen/reader/file_reader.py +++ b/ingen/reader/file_reader.py @@ -24,13 +24,16 @@ def read(self, src): config = get_config(src) dtype = src.get('dtype') try: - result = pd.read_csv(src['file_path'], - sep=src.get('delimiter'), - index_col=False, - skiprows=config['header_size'], - skipfooter=config['trailer_size'], - names=config['all_cols'], - dtype=dtype) + result = pd.read_csv( + src['file_path'], + sep=src.get('delimiter'), + index_col=False, + skiprows=config['header_size'], + skipfooter=config['trailer_size'], + names=config['all_cols'], + dtype=dtype, + encoding=src['file_encoding'], + ) except TypeError: logging.error(self.DTYPE_LOG_MSG) raise