diff --git a/ingen/formatters/common_formatters.py b/ingen/formatters/common_formatters.py index 97a9de7d..e82f917d 100644 --- a/ingen/formatters/common_formatters.py +++ b/ingen/formatters/common_formatters.py @@ -1,6 +1,9 @@ # Copyright (c) 2023 BlackRock, Inc. # All Rights Reserved. +from ingen.utils.properties import Properties +from ingen.utils.utils import get_business_day + import calendar import logging import re @@ -16,8 +19,7 @@ from ingen.lib.cryptor import Cryptor pd.options.mode.chained_assignment = None -from ingen.utils.properties import Properties -from ingen.utils.utils import get_business_day + log = logging.getLogger() @@ -30,7 +32,11 @@ def column_filter(dataframe, required_column_names): :return: Dataframe with only required columns in the desired order """ all_columns = list(dataframe.columns) - dataframe.drop(columns=[column for column in all_columns if column not in required_column_names]) + dataframe.drop( + columns=[ + column for column in all_columns if column not in required_column_names + ] + ) return dataframe.reindex(columns=required_column_names) @@ -68,10 +74,12 @@ def decryption_formatter(dataframe, col_id, format_options, runtime_params): def date_formatter(dataframe, col_id, date_format, runtime_params): - source_date_format = date_format['src'] - final_date_format = date_format['des'] + source_date_format = date_format["src"] + final_date_format = date_format["des"] dataframe[col_id] = pd.to_datetime(dataframe[col_id], format=source_date_format) - dataframe[col_id] = dataframe[col_id].map(lambda x: x.strftime(final_date_format) if pd.notnull(x) else '') + dataframe[col_id] = dataframe[col_id].map( + lambda x: x.strftime(final_date_format) if pd.notnull(x) else "" + ) return dataframe @@ -89,13 +97,16 @@ def constant_date_formatter(dataframe, col, format_options, runtime_params): try: date_offset, date_format, calendar_country = format_options except ValueError: - error_msg = 'Missing required formatting attributes for constant date formatter. It should be a ' \ - 'list of date_offset, date_format and calendar_country. e.g., [0, "%Y%m%d", "US"]' + error_msg = ( + "Missing required formatting attributes for constant date formatter. It should be a " + 'list of date_offset, date_format and calendar_country. e.g., [0, "%Y%m%d", "US"]' + ) log.error(error_msg) raise - dataframe[col] = get_business_day(datetime.today() + timedelta(date_offset), 'next', calendar_country).strftime( - date_format) + dataframe[col] = get_business_day( + datetime.today() + timedelta(date_offset), "next", calendar_country + ).strftime(date_format) return dataframe @@ -108,19 +119,21 @@ def concat_formatter(dataframe, col_name, config, runtime_params): :param runtime_params: command line arguments :return: dataframe with a new concatenated column """ - default_separator = '' - separator = config.get('separator', default_separator) - columns = config.get('columns') + default_separator = "" + separator = config.get("separator", default_separator) + columns = config.get("columns") if any([column not in dataframe for column in columns]): - log.error('Unknown column passed to concat formatter') + log.error("Unknown column passed to concat formatter") raise ValueError dataframe[col_name] = dataframe[columns].astype(str).agg(separator.join, axis=1) return dataframe -def duplicate_column_formatter(dataframe, duplicate_col_name, original_col_name, runtime_params): +def duplicate_column_formatter( + dataframe, duplicate_col_name, original_col_name, runtime_params +): """ Adds a duplicate column with a new name :param dataframe: original dataframe @@ -130,7 +143,7 @@ def duplicate_column_formatter(dataframe, duplicate_col_name, original_col_name, :return: dataframe with a duplicate column """ if original_col_name not in dataframe: - log.error('Cannot create duplicate of a nonexistent column') + log.error("Cannot create duplicate of a nonexistent column") raise KeyError dataframe[duplicate_col_name] = dataframe[original_col_name] @@ -147,29 +160,31 @@ def group_percentage_formatter(dataframe, col_name, format_options, runtime_para :param runtime_params: command line arguments :return: original dataframe with the new column added """ - of_col = format_options.get('of') - in_col = format_options.get('in') + of_col = format_options.get("of") + in_col = format_options.get("in") column_names = list(dataframe.columns) if of_col not in column_names or in_col not in column_names: - log.error('Unknown column names provided for calculating grouped percentage.') + log.error("Unknown column names provided for calculating grouped percentage.") raise ValueError - dataframe[col_name] = dataframe.groupby(in_col)[of_col].transform(lambda x: 100 * x / sum(x)) + dataframe[col_name] = dataframe.groupby(in_col)[of_col].transform( + lambda x: 100 * x / sum(x) + ) return dataframe def sum_value_formatter(dataframe, col_name, columns, runtime_params): """ - Adds a new column with the summation of column A and column B - :param dataframe: original dataframe - :param col_name: name of the new column - :param columns: columns which are to be added - :param runtime_params: command line arguments - :return: original dataframe with the new column added - """ + Adds a new column with the summation of column A and column B + :param dataframe: original dataframe + :param col_name: name of the new column + :param columns: columns which are to be added + :param runtime_params: command line arguments + :return: original dataframe with the new column added + """ if any([column not in dataframe for column in columns]): - log.error('Unknown column passed to concat formatter') + log.error("Unknown column passed to concat formatter") raise ValueError dataframe[col_name] = dataframe[columns[0]] + dataframe[columns[1]] return dataframe @@ -189,9 +204,12 @@ def date_diff_formatter(dataframe, col_name, format_options, runtime_params): to_date = format_options[1] date_format = format_options[2] if from_date not in dataframe or to_date not in dataframe: - raise KeyError("Column does not exist in dataframe. Check column names in the date-diff formatter config") - delta = pd.to_datetime(dataframe[from_date], format=date_format) - pd.to_datetime(dataframe[to_date], - format=date_format) + raise KeyError( + "Column does not exist in dataframe. Check column names in the date-diff formatter config" + ) + delta = pd.to_datetime(dataframe[from_date], format=date_format) - pd.to_datetime( + dataframe[to_date], format=date_format + ) dataframe[col_name] = delta.dt.days return dataframe @@ -205,53 +223,61 @@ def bucket_formatter(dataframe, col_name, config, runtime_params): :param runtime_params: command line arguments, not used in this formatter :return: dataframe with 'col_name' column modified as per the ranges """ - buckets = config.get('buckets') + buckets = config.get("buckets") bins = list(map(lambda x: float(x), buckets)) - labels = config.get('labels', False) - include_right = True if str.lower(config.get('include_right', 'true')) == 'true' else False - - categories = pd.cut(dataframe[col_name], bins=bins, labels=labels, right=include_right) + labels = config.get("labels", False) + include_right = ( + True if str.lower(config.get("include_right", "true")) == "true" else False + ) + + categories = pd.cut( + dataframe[col_name], bins=bins, labels=labels, right=include_right + ) dataframe[col_name] = categories.values.astype(str) return dataframe -def arithmetic_calculation_formatter(dataframe, col_name, format_options, runtime_params): +def arithmetic_calculation_formatter( + dataframe, col_name, format_options, runtime_params +): """ - Modifies the value of column or generates a new column with mathematically calculated value - :param dataframe: original dataframe - :param col_name: name of the new column - :param format_options : contains columns and operation - :param runtime_params: command line arguments - :return: updated dataframe with the modified column + Modifies the value of column or generates a new column with mathematically calculated value + :param dataframe: original dataframe + :param col_name: name of the new column + :param format_options : contains columns and operation + :param runtime_params: command line arguments + :return: updated dataframe with the modified column """ - columns = format_options.get('cols') - operation = format_options.get('operation') - value = format_options.get('value') + columns = format_options.get("cols") + operation = format_options.get("operation") + value = format_options.get("value") - valid_operations = ['div', 'mul', 'sub', 'add', 'abs'] + valid_operations = ["div", "mul", "sub", "add", "abs"] if operation not in valid_operations: - log.error(f'Unknown arithmetic operation. Valid operations are {valid_operations}') + log.error( + f"Unknown arithmetic operation. Valid operations are {valid_operations}" + ) raise ValueError - if operation == 'div': + if operation == "div": dataframe = divide(dataframe, col_name, columns) if value is not None: dataframe[col_name] = dataframe[col_name].div(value) - elif operation == 'mul': + elif operation == "mul": dataframe = multiply(dataframe, col_name, columns) if value is not None: dataframe[col_name] = dataframe[col_name].mul(value) - elif operation == 'add': + elif operation == "add": dataframe = addition(dataframe, col_name, columns) if value is not None: dataframe[col_name] = dataframe[col_name].add(value) - elif operation == 'sub': + elif operation == "sub": dataframe = subtract(dataframe, col_name, columns) if value is not None: dataframe[col_name] = dataframe[col_name].sub(value) - elif operation == 'abs': + elif operation == "abs": dataframe[col_name] = pd.to_numeric(dataframe[col_name]).abs() return dataframe @@ -266,7 +292,7 @@ def fill_empty_values(dataframe, col_name, format_options, runtime_params): :param runtime_params: command line arguments :return: new dataframe with NA values replaced in column 'col_name' """ - from_column = format_options.get('column') + from_column = format_options.get("column") if from_column not in dataframe.columns: raise KeyError(f"Column '{from_column}' not found.") @@ -274,7 +300,9 @@ def fill_empty_values(dataframe, col_name, format_options, runtime_params): return dataframe -def fill_empty_values_with_custom_value(dataframe, col_name, format_options, runtime_params): +def fill_empty_values_with_custom_value( + dataframe, col_name, format_options, runtime_params +): """ NaN rows are filled with constant values :param dataframe: original dataframe @@ -284,15 +312,24 @@ def fill_empty_values_with_custom_value(dataframe, col_name, format_options, run :param runtime_params: command line arguments :return: new dataframe with NA values replaced in column 'col_name' with the 'value' """ - custom_value = format_options.get('value') - condition = format_options.get('condition') + custom_value = format_options.get("value") + condition = format_options.get("condition") if not pd.Series(col_name).isin(dataframe.columns).all(): raise KeyError(f"Column '{col_name}' not found.") if condition is not None and len(condition) == 2: - dataframe[col_name] = np.where(np.logical_and(dataframe[col_name].isna(), ( - dataframe[condition.get('match_col')].str.match(pat=condition.get('pattern')))), - custom_value, dataframe[col_name]) + dataframe[col_name] = np.where( + np.logical_and( + dataframe[col_name].isna(), + ( + dataframe[condition.get("match_col")].str.match( + pat=condition.get("pattern") + ) + ), + ), + custom_value, + dataframe[col_name], + ) else: dataframe[col_name] = dataframe[col_name].fillna(value=custom_value) @@ -307,16 +344,17 @@ def business_day_formatter(dataframe, new_col_name, format_options, runtime_para :param runtime_params: command line arguments :return: dataframe with a new column called billing_date for every entry of date column. """ - date_col = format_options.get('col') - date_format = format_options.get('format') - calendar_country = format_options.get('cal') + date_col = format_options.get("col") + date_format = format_options.get("format") + calendar_country = format_options.get("cal") if date_col not in dataframe.columns: raise KeyError(f"Column '{date_col}' not found.") dataframe[date_col] = pd.to_datetime(dataframe[date_col], format=date_format) dataframe[new_col_name] = dataframe[date_col].apply( - lambda x: get_business_day(x.to_pydatetime(), 'prev', country=calendar_country)) + lambda x: get_business_day(x.to_pydatetime(), "prev", country=calendar_country) + ) return dataframe @@ -332,28 +370,33 @@ def split_column_formatter(dataframe, col_name, format_options, runtime_params): if col_name not in dataframe.columns: raise KeyError(f"Column '{col_name}' not found.") if dataframe.empty: - raise ValueError(f"Given Dataframe is Empty.") - if (dataframe[col_name] == 'na').all(): + raise ValueError("Given Dataframe is Empty.") + if (dataframe[col_name] == "na").all(): return dataframe - if dataframe[col_name].dtype == object and isinstance(dataframe.iloc[0][col_name], str): - dataframe[format_options.get('new_col_names')] = dataframe[col_name].str.split( - format_options.get('delimiter', ','), expand=True) + if dataframe[col_name].dtype == object and isinstance( + dataframe.iloc[0][col_name], str + ): + dataframe[format_options.get("new_col_names")] = dataframe[col_name].str.split( + format_options.get("delimiter", ","), expand=True + ) else: - split_df = pd.DataFrame(dataframe[col_name].tolist(), columns=format_options.get('new_col_names')) + split_df = pd.DataFrame( + dataframe[col_name].tolist(), columns=format_options.get("new_col_names") + ) dataframe = pd.concat([dataframe, split_df], axis=1) return dataframe def spacing_formatter(dataframe, col_name, format_options, runtime_params): - num_of_spaces = format_options.get('spacing') + num_of_spaces = format_options.get("spacing") if num_of_spaces < 0: - raise ValueError(f"Spaces cannot be negative") + raise ValueError("Spaces cannot be negative") if col_name not in dataframe.columns: raise KeyError(f"Column '{col_name}' not found.") if dataframe.empty: - raise ValueError(f"Given Dataframe is Empty.") + raise ValueError("Given Dataframe is Empty.") else: dataframe[col_name] = dataframe[col_name].astype("string") dataframe[col_name] = dataframe[col_name].str.ljust(num_of_spaces, " ") @@ -361,15 +404,17 @@ def spacing_formatter(dataframe, col_name, format_options, runtime_params): def add_trailing_zeros_formatter(dataframe, col_name, format_options, runtime_params): - max_chars = format_options.get('num_of_chars') + max_chars = format_options.get("num_of_chars") if max_chars < 0: - raise ValueError(f"Number of characters in a string cannot be negative") + raise ValueError("Number of characters in a string cannot be negative") if col_name not in dataframe.columns: raise KeyError(f"Column '{col_name}' not found.") if dataframe.empty: - raise ValueError(f"Given Dataframe is Empty.") + raise ValueError("Given Dataframe is Empty.") else: - dataframe[col_name] = dataframe[col_name].apply(lambda x: str(x).zfill(max_chars)) + dataframe[col_name] = dataframe[col_name].apply( + lambda x: str(x).zfill(max_chars) + ) return dataframe @@ -380,8 +425,8 @@ def last_date_of_prev_month(dataframe, col_name, format_options, runtime_params) col_name is the name of the newly added column """ - outdate_format = format_options.get('outdate_format', '%Y%m%d') - run_date = runtime_params.get('run_date') + outdate_format = format_options.get("outdate_format", "%Y%m%d") + run_date = runtime_params.get("run_date") dt_final = run_date.replace(day=1) - timedelta(days=1) dataframe[col_name] = str(dt_final.strftime(outdate_format)) return dataframe @@ -389,24 +434,24 @@ def last_date_of_prev_month(dataframe, col_name, format_options, runtime_params) def conditional_replace_formatter(dataframe, col_name, format_options, runtime_params): """ - selected rows are filled with column value mentioned - :param dataframe: original dataframe - :param col_name: column name on which the formatter is applied - :param format_options: dictionary key called value (representing - the values to fill) - :param runtime_params: command line arguments - :return: new dataframe with values replaced in column 'col_name' with the 'from_column' - """ - from_column = format_options.get('from_column') - condition = format_options.get('condition') + selected rows are filled with column value mentioned + :param dataframe: original dataframe + :param col_name: column name on which the formatter is applied + :param format_options: dictionary key called value (representing + the values to fill) + :param runtime_params: command line arguments + :return: new dataframe with values replaced in column 'col_name' with the 'from_column' + """ + from_column = format_options.get("from_column") + condition = format_options.get("condition") if from_column not in dataframe.columns: raise KeyError(f"Column '{from_column}' not found.") - match_col = dataframe[condition.get('match_col')].astype(str) - pattern = str(condition.get('pattern')) + match_col = dataframe[condition.get("match_col")].astype(str) + pattern = str(condition.get("pattern")) dataframe[col_name] = np.where( - match_col.str.match(pat=pattern), - dataframe[from_column], dataframe[col_name]) + match_col.str.match(pat=pattern), dataframe[from_column], dataframe[col_name] + ) return dataframe @@ -421,9 +466,10 @@ def replace_value(dataframe, col_name, format_options, runtime_params): """ if col_name not in dataframe.columns: raise KeyError(f"Column '{col_name}' not found.") - for from_value, to_value in zip(format_options.get('from_value'), format_options.get('to_value')): - dataframe[col_name] = dataframe[col_name].replace( - {from_value: to_value}) + for from_value, to_value in zip( + format_options.get("from_value"), format_options.get("to_value") + ): + dataframe[col_name] = dataframe[col_name].replace({from_value: to_value}) return dataframe @@ -438,10 +484,12 @@ def extract_from_pattern(dataframe, col_name, format_options, runtime_params): """ if col_name not in dataframe.columns: raise KeyError(f"Column '{col_name}' not found.") - pattern = format_options.get('pattern') + pattern = format_options.get("pattern") dataframe[col_name] = dataframe[col_name].map( - lambda x: None if len(re.findall(pattern, str(x))) == 0 else - (re.findall(pattern, str(x)))[0]) + lambda x: None + if len(re.findall(pattern, str(x))) == 0 + else (re.findall(pattern, str(x)))[0] + ) return dataframe @@ -484,7 +532,7 @@ def get_running_environment(dataframe, col_name, format_options, runtime_params) :return: new dataframe with containing constant set for the running environment on the specified column """ - env = Properties.get_property('ENV') + env = Properties.get_property("ENV") constant = format_options.get(env) dataframe[col_name] = constant @@ -493,16 +541,18 @@ def get_running_environment(dataframe, col_name, format_options, runtime_params) def runtime_date(dataframe, col_name, format_options, runtime_params): """ - create a new row with runtime date value - :param dataframe: original dataframe - :param col_name: column name on which the formatter is applied - :param format_options: dictionary key called value (representing - the values to fill) - :param runtime_params: command line arguments - :return: new dataframe with NA values replaced in column 'col_name' with the 'value' - """ - dataframe[col_name] = runtime_params['run_date'] - return date_formatter(dataframe, col_name, {'src': '%m%d%Y', 'des': format_options.get('des')}, {}) + create a new row with runtime date value + :param dataframe: original dataframe + :param col_name: column name on which the formatter is applied + :param format_options: dictionary key called value (representing + the values to fill) + :param runtime_params: command line arguments + :return: new dataframe with NA values replaced in column 'col_name' with the 'value' + """ + dataframe[col_name] = runtime_params["run_date"] + return date_formatter( + dataframe, col_name, {"src": "%m%d%Y", "des": format_options.get("des")}, {} + ) def add_uuid_col(dataframe, col_name, format_options, runtime_params): @@ -520,15 +570,15 @@ def add_uuid_col(dataframe, col_name, format_options, runtime_params): def sub_string(dataframe, col_name, format_options, runtime_params): """ - get a substring from the column value - :param dataframe: original dataframe - :param col_name: name of the new column to be added - :param format_options: represent start and end index to get the substring from the string - :param runtime_params: Not required for this formatter - :return: dataframe with a new substring value of that column - """ - start = format_options.get('start', None) - end = format_options.get('end', None) + get a substring from the column value + :param dataframe: original dataframe + :param col_name: name of the new column to be added + :param format_options: represent start and end index to get the substring from the string + :param runtime_params: Not required for this formatter + :return: dataframe with a new substring value of that column + """ + start = format_options.get("start", None) + end = format_options.get("end", None) dataframe[col_name] = dataframe[col_name].astype(str).str[start:end] return dataframe @@ -542,8 +592,10 @@ def float_precision(dataframe, col_name, format_options, runtime_params): :param runtime_params: Not required for this formatter :return: dataframe with a float point values formatted """ - precision = format_options.get('precision') - dataframe[col_name] = dataframe[col_name].map(lambda x: float(f"%.{precision}f" % x)) + precision = format_options.get("precision") + dataframe[col_name] = dataframe[col_name].map( + lambda x: float(f"%.{precision}f" % x) + ) return dataframe @@ -556,10 +608,10 @@ def drop_duplicates(dataframe, col_name, format_options, runtime_params): :param runtime_params: Not required for this formatter :return: dataframe with a float point values formatted """ - valid_vals = ['first', 'last', False] - keep_opt = format_options.get('keep') + valid_vals = ["first", "last", False] + keep_opt = format_options.get("keep") if keep_opt not in valid_vals: - keep_opt = 'first' + keep_opt = "first" return dataframe.drop_duplicates(subset=[col_name], keep=keep_opt) @@ -572,13 +624,13 @@ def prefix_string_formatter(dataframe, col_name, config, runtime_params): :param runtime_params: command line arguments :return: dataframe with a new concatenated column """ - default_separator = '' - separator = config.get('separator', default_separator) - columns = config.get('columns') - start = config.get('prefix', None) + default_separator = "" + separator = config.get("separator", default_separator) + columns = config.get("columns") + start = config.get("prefix", None) if any([column not in dataframe for column in columns]): - log.error('Unknown column passed to concat formatter') + log.error("Unknown column passed to concat formatter") raise ValueError dataframe[col_name] = dataframe[columns].astype(str).agg(separator.join, axis=1) @@ -588,55 +640,55 @@ def prefix_string_formatter(dataframe, col_name, config, runtime_params): formatter_map = { - 'date': date_formatter, - 'float': float_formatter, - 'concat': concat_formatter, - 'constant': constant_formatter, - 'constant-date': constant_date_formatter, - 'duplicate': duplicate_column_formatter, - 'decryption': decryption_formatter, - 'encryption': encryption_formatter, - 'group-percentage': group_percentage_formatter, - 'sum': sum_value_formatter, - 'date-diff': date_diff_formatter, - 'bucket': bucket_formatter, - 'arithmetic_calc': arithmetic_calculation_formatter, - 'fill_empty_values': fill_empty_values, - 'fill_empty_values_with_custom_value': fill_empty_values_with_custom_value, - 'replace_value': replace_value, - 'runtime_date': runtime_date, - 'uuid': add_uuid_col, - 'sub_string': sub_string, - 'conditional_replace_formatter': conditional_replace_formatter, - 'bus_day': business_day_formatter, - 'split_col': split_column_formatter, - 'float_precision': float_precision, - 'extract_from_pattern': extract_from_pattern, - 'index_counter': index_counter, - 'add_space': spacing_formatter, - 'add_trailing_zeros': add_trailing_zeros_formatter, - 'last_date_of_prev_month': last_date_of_prev_month, - 'current_timestamp': current_timestamp, - 'get_running_environment': get_running_environment, - 'drop_duplicates': drop_duplicates, - 'prefix_string': prefix_string_formatter, - + "date": date_formatter, + "float": float_formatter, + "concat": concat_formatter, + "constant": constant_formatter, + "constant-date": constant_date_formatter, + "duplicate": duplicate_column_formatter, + "decryption": decryption_formatter, + "encryption": encryption_formatter, + "group-percentage": group_percentage_formatter, + "sum": sum_value_formatter, + "date-diff": date_diff_formatter, + "bucket": bucket_formatter, + "arithmetic_calc": arithmetic_calculation_formatter, + "fill_empty_values": fill_empty_values, + "fill_empty_values_with_custom_value": fill_empty_values_with_custom_value, + "replace_value": replace_value, + "runtime_date": runtime_date, + "uuid": add_uuid_col, + "sub_string": sub_string, + "conditional_replace_formatter": conditional_replace_formatter, + "bus_day": business_day_formatter, + "split_col": split_column_formatter, + "float_precision": float_precision, + "extract_from_pattern": extract_from_pattern, + "index_counter": index_counter, + "add_space": spacing_formatter, + "add_trailing_zeros": add_trailing_zeros_formatter, + "last_date_of_prev_month": last_date_of_prev_month, + "current_timestamp": current_timestamp, + "get_running_environment": get_running_environment, + "drop_duplicates": drop_duplicates, + "prefix_string": prefix_string_formatter, } def get_formatter_from_type(formatter_type): """ - gets an formatter, returns formatter function + gets an formatter, returns formatter function - :param formatter_type : name/type of the formatter + :param formatter_type : name/type of the formatter """ return formatter_map.get(formatter_type) + def add_formatter(formatter_type, formatter): """ - adds a new formatter, adding for flexibility for users of ingen + adds a new formatter, adding for flexibility for users of ingen - :param formatter_type : name/type of the formatter - :param formatter : formatter function + :param formatter_type : name/type of the formatter + :param formatter : formatter function """ - return formatter_map.update( {formatter_type: formatter} ) + return formatter_map.update({formatter_type: formatter}) diff --git a/ingen/formatters/formatter.py b/ingen/formatters/formatter.py index f5b02db6..5d50aadc 100644 --- a/ingen/formatters/formatter.py +++ b/ingen/formatters/formatter.py @@ -1,7 +1,14 @@ # Copyright (c) 2023 BlackRock, Inc. # All Rights Reserved. -from ingen.formatters.common_formatters import * +import logging +import time + +from ingen.formatters.common_formatters import ( + column_filter, + get_formatter_from_type, + name_formatter, +) log = logging.getLogger() @@ -16,26 +23,37 @@ def __init__(self, df, columns, params): def map_column_id_name(self): id_name_map = {} for column in self._columns: - id_name_map[column['src_col_name']] = column['dest_col_name'] if 'dest_col_name' in column else column[ - 'src_col_name'] + id_name_map[column["src_col_name"]] = ( + column["dest_col_name"] + if "dest_col_name" in column + else column["src_col_name"] + ) return id_name_map def apply_format(self): for column in self._columns: - for formatter in column.get('formatters', []): - formatter_func = get_formatter_from_type(formatter['type']) - col_name = column.get('src_col_name') + for formatter in column.get("formatters", []): + formatter_func = get_formatter_from_type(formatter["type"]) + col_name = column.get("src_col_name") if formatter_func is None: - raise ValueError(f"Invalid formatter type: {formatter.get('type')} " - f"on column {col_name}") - log.info(f"Formatting column {col_name} using {formatter.get('type')} formatter") + raise ValueError( + f"Invalid formatter type: {formatter.get('type')} " + f"on column {col_name}" + ) + log.info( + f"Formatting column {col_name} using {formatter.get('type')} formatter" + ) start = time.time() - self._df = formatter_func(self._df, col_name, formatter.get('format'), self._param) + self._df = formatter_func( + self._df, col_name, formatter.get("format"), self._param + ) end = time.time() - log.info(f"Finished '{formatter.get('type')}' formatter on column {col_name} " - f"in {end - start:.2f} seconds") + log.info( + f"Finished '{formatter.get('type')}' formatter on column {col_name} " + f"in {end - start:.2f} seconds" + ) - column_names = [col['src_col_name'] for col in self._columns] + column_names = [col["src_col_name"] for col in self._columns] self._df = column_filter(self._df, column_names) self._df = name_formatter(self._df, self._id_name_map) return self._df diff --git a/ingen/formatters/utils.py b/ingen/formatters/utils.py index e8870d5a..c6bb2310 100644 --- a/ingen/formatters/utils.py +++ b/ingen/formatters/utils.py @@ -1,6 +1,7 @@ # Copyright (c) 2023 BlackRock, Inc. # All Rights Reserved. + def addition(dataframe, col_name, columns): if columns is None or len(columns) < 2: return dataframe diff --git a/ingen/generators/interface_generator.py b/ingen/generators/interface_generator.py index 96feeb2f..7119c826 100644 --- a/ingen/generators/interface_generator.py +++ b/ingen/generators/interface_generator.py @@ -45,11 +45,11 @@ def validate(self, df, columns, data=None, sources=None): validation_summaries = [] validated_dataframe = None if sources is None: - log.info(f"Starting validations on Formatted data") + log.info("Starting validations on Formatted data") self.validations = Validation(df, columns, data=data) validated_dataframe, validation_summary = self.validations.apply_validations() validation_summaries.append(validation_summary) - log.info(f" Finished validations on Formatted data") + log.info(" Finished validations on Formatted data") else: for source in sources: validation_list = source.fetch_validations() diff --git a/ingen/reader/data_source.py b/ingen/reader/data_source.py index 37d6f6f7..df162235 100644 --- a/ingen/reader/data_source.py +++ b/ingen/reader/data_source.py @@ -18,7 +18,7 @@ def get_connection(self): self.__connection = connection.MySQLConnection(host=self.__host, user=self.__user, password=self.__passwd, database=self.__database) if not self.__connection else self.__connection return self.__connection - except Error as e: + except Error: raise RuntimeError("Not able to establish connection with this database.") def get_cursor(self): diff --git a/ingen/utils/sql_query_parser.py b/ingen/utils/sql_query_parser.py index 05423dea..e4a0bcac 100644 --- a/ingen/utils/sql_query_parser.py +++ b/ingen/utils/sql_query_parser.py @@ -88,7 +88,7 @@ def insert_values(cls, temp_table_config): file_col = key['file_col'] col_size = f"({key.get('size')})" if 'size' in key else '' separtaor = '' if key == temp_table_cols[-1] else ',' - col_config += col_name + f" " + col_type + col_size + separtaor + col_config += col_name + " " + col_type + col_size + separtaor col_list += col_name + separtaor file_cols.append(file_col) default_val = cls.fill_empty_values(key, col_type) diff --git a/ingen/validation/notification.py b/ingen/validation/notification.py index 175e040c..e70d44c9 100644 --- a/ingen/validation/notification.py +++ b/ingen/validation/notification.py @@ -56,5 +56,5 @@ def email_attributes(params, validation_action, validation_summary): send_email(validation_action_to_address, email_body, subject) if "blocker" in str(validation_summary): raise ValueError( - f"Error while Validating interface file for the columns having severity as blocker" + "Error while Validating interface file for the columns having severity as blocker" ) diff --git a/test/formatters/test_utils.py b/test/formatters/test_utils.py index 617b7574..8583c794 100644 --- a/test/formatters/test_utils.py +++ b/test/formatters/test_utils.py @@ -17,7 +17,7 @@ def test_addition_function_with_multiple_rows(self): 'weight4': [56, 13, 22] }) col_name = 'sum' - columns = ['weight1', 'weight2', 'weight3', 'weight4']; + columns = ['weight1', 'weight2', 'weight3', 'weight4'] expected_data = sample_data.copy() expected_data[col_name] = [106, 90, 121] formatted_data = addition(sample_data, col_name, columns) @@ -43,7 +43,7 @@ def test_subtraction_function_with_multiple_rows(self): 'weight4': [2, 5, 10] }) col_name = 'sub' - columns = ['weight1', 'weight2', 'weight3', 'weight4']; + columns = ['weight1', 'weight2', 'weight3', 'weight4'] expected_data = sample_data.copy() expected_data[col_name] = [10, 10, 10] formatted_data = subtract(sample_data, col_name, columns) @@ -69,7 +69,7 @@ def test_divide_function_with_multiple_rows(self): 'weight4': [2, 2, 2] }) col_name = 'div' - columns = ['weight1', 'weight2', 'weight3', 'weight4']; + columns = ['weight1', 'weight2', 'weight3', 'weight4'] expected_data = sample_data.copy() expected_data[col_name] = [16.0, 4.0, 8.0] formatted_data = divide(sample_data, col_name, columns) @@ -95,7 +95,7 @@ def test_multiply_function_with_multiple_rows(self): 'weight4': [2, 2, 2] }) col_name = 'mul' - columns = ['weight1', 'weight2', 'weight3', 'weight4']; + columns = ['weight1', 'weight2', 'weight3', 'weight4'] expected_data = sample_data.copy() expected_data[col_name] = [128, 32, 64] formatted_data = multiply(sample_data, col_name, columns) diff --git a/test/metadata/test_metadata.py b/test/metadata/test_metadata.py index 7d7f1c19..86cb73b6 100644 --- a/test/metadata/test_metadata.py +++ b/test/metadata/test_metadata.py @@ -162,7 +162,7 @@ def test_path_without_date(self): output = metadata.output - expected_path = f"/some/path/name.csv" + expected_path = "/some/path/name.csv" self.assertEqual(expected_path, output.get("props").get("path")) def test_metadata_validation_action(self): diff --git a/test/reader/test_xml_file_reader.py b/test/reader/test_xml_file_reader.py index 592f37c8..60ffc860 100644 --- a/test/reader/test_xml_file_reader.py +++ b/test/reader/test_xml_file_reader.py @@ -4,7 +4,6 @@ import unittest from pathlib import Path from pyexpat import ExpatError -from typing import Dict, Union, List from unittest.mock import patch import pandas as pd