"""Spreadsheet ingestion helpers: per-column type detection and statistics."""

# NOTE(review): this module text was rebuilt from a whitespace-collapsed diff.
# The diff also showed partial context for read_file, read_text_file,
# read_pdf_file, read_docx_file, read_image_file and read_code_file, the
# pytesseract `tesseract_cmd` setup, and a hunk for src/prompt_extractor.py
# (generate_prompt). Those pieces are only partially visible there and are
# intentionally not reproduced here.

import os
from datetime import datetime

import pandas as pd

# Candidate layouts, tried in order. Day-first variants come before
# month-first ones, so an ambiguous value such as '03-04-2023' resolves to
# the day-first format.
_DATETIME_FORMATS = (
    '%Y-%m-%d %H:%M:%S',     # '2023-02-28 14:30:00'
    '%Y-%m-%d %H:%M',        # '2023-02-28 14:30'
    '%Y-%m-%d %H:%M:%S.%f',  # '2023-02-28 14:30:00.000000'
    '%Y-%m-%d',              # '2023-02-28'
    '%d-%m-%Y %H:%M:%S',     # '28-02-2023 14:30:00'
    '%d-%m-%Y %H:%M',        # '28-02-2023 14:30'
    '%d-%m-%Y',              # '28-02-2023'
    '%m-%d-%Y %H:%M:%S',     # '02-28-2023 14:30:00'
    '%m-%d-%Y %H:%M',        # '02-28-2023 14:30'
    '%m-%d-%Y',              # '02-28-2023'
    '%Y/%m/%d %H:%M:%S',     # '2023/02/28 14:30:00'
    '%Y/%m/%d %H:%M',        # '2023/02/28 14:30'
    '%Y/%m/%d %H:%M:%S.%f',  # '2023/02/28 14:30:00.000000'
    '%Y/%m/%d',              # '2023/02/28'
    '%d/%m/%Y %H:%M:%S',     # '28/02/2023 14:30:00'
    '%d/%m/%Y %H:%M',        # '28/02/2023 14:30'
    '%d/%m/%Y',              # '28/02/2023'
    '%m/%d/%Y %H:%M:%S',     # '02/28/2023 14:30:00'
    '%m/%d/%Y %H:%M',        # '02/28/2023 14:30'
    '%m/%d/%Y',              # '02/28/2023'
)

# Lower-cased spellings accepted as boolean values in string columns.
_BOOLEAN_LITERALS = frozenset({'true', 'false'})

# Extensions routed to pandas.read_excel; everything else is read as CSV.
_EXCEL_EXTENSIONS = ('.xlsx', '.xlsm', '.xls')


def _match_datetime_format(date_str):
    """Return ``(format, parsed_datetime)`` for the first matching format.

    Raises:
        ValueError: if ``date_str`` is not a string or matches no known format.
    """
    # Missing spreadsheet cells arrive as float NaN / None; strptime would
    # raise TypeError for them, which callers catching ValueError would miss.
    if isinstance(date_str, str):
        for date_format in _DATETIME_FORMATS:
            try:
                return date_format, datetime.strptime(date_str, date_format)
            except ValueError:
                continue
    raise ValueError("Could not infer datetime format from input string")


def infer_datetime_format(date_str):
    """Return the first known strptime format that parses ``date_str``.

    Raises:
        ValueError: if ``date_str`` is not a string or matches no known format.
    """
    return _match_datetime_format(date_str)[0]


def read_excel(file_path):
    """Load an Excel workbook into a DataFrame via ``pandas.read_excel``."""
    return pd.read_excel(file_path)


def read_csv(file_path):
    """Load a CSV file into a DataFrame via ``pandas.read_csv``."""
    return pd.read_csv(file_path)


# Functions to compute statistics for different datatypes
def get_numeric_stats(col_data):
    """Return descriptive statistics for a numeric column as a dict."""
    return {
        'mean': col_data.mean(),
        'median': col_data.median(),
        'std': col_data.std(),
        'min': col_data.min(),
        'max': col_data.max(),
        '25th_percentile': col_data.quantile(0.25),
        '75th_percentile': col_data.quantile(0.75),
        'sum': col_data.sum(),
        'count': col_data.count(),
        'variance': col_data.var(),
    }


def get_string_stats(col_data):
    """Return length and frequency statistics for a text column as a dict."""
    # Each derived series is computed once and reused below.
    modes = col_data.mode()
    lengths = col_data.str.len()
    counts = col_data.value_counts()
    return {
        'mode': modes.iloc[0] if not modes.empty else None,
        'unique_count': col_data.nunique(),
        'total_length': lengths.sum(),
        'mean_length': lengths.mean(),
        'max_length': lengths.max(),
        'min_length': lengths.min(),
        'most_frequent': counts.idxmax() if not counts.empty else None,
        'least_frequent': counts.idxmin() if not counts.empty else None,
        'frequency_distribution': counts.to_dict(),
        'unique_values': col_data.unique(),
    }


def get_datetime_stats(col_data):
    """Return range statistics for a datetime column as a dict."""
    earliest = col_data.min()
    latest = col_data.max()
    return {
        'min_date': earliest,
        'max_date': latest,
        'range': latest - earliest,
        'median_date': col_data.median(),
        'start_year': earliest.year,
        'end_year': latest.year,
        'start_month': earliest.month,
        'end_month': latest.month,
        'start_day': earliest.day,
        'end_day': latest.day,
    }


def _to_boolean_series(col_data):
    """Return ``col_data`` as a bool-dtype Series with missing values dropped.

    Strings are compared case-insensitively against 'true'; any other value
    is converted with ``bool()``.
    """
    if col_data.dtype.name == 'bool':
        return col_data
    present = col_data.dropna()
    flags = present.map(
        lambda value: value.lower() == 'true' if isinstance(value, str) else bool(value)
    )
    return flags.astype(bool)


def get_boolean_stats(col_data):
    """Return true/false counts and percentages for a boolean column.

    Accepts real bool columns as well as columns of 'True'/'False' strings in
    any letter case; all statistics are computed on the normalised booleans.
    """
    flags = _to_boolean_series(col_data)
    true_count = flags.sum()
    true_ratio = flags.mean()
    modes = flags.mode()
    counts = flags.value_counts()
    return {
        'count_true': true_count,
        'count_false': flags.count() - true_count,
        'percent_true': true_ratio * 100,
        'percent_false': (1 - true_ratio) * 100,
        'unique_count': flags.nunique(),
        'mode': modes.iloc[0] if not modes.empty else None,
        'most_frequent': counts.idxmax() if not counts.empty else None,
        'least_frequent': counts.idxmin() if not counts.empty else None,
        'frequency_distribution': counts.to_dict(),
    }


def _to_datetime_series(col_data):
    """Convert a column accepted by ``_is_datetime_dtype`` to datetime64.

    Missing values are dropped. Strings are parsed with the same format list
    used for detection, so detection and parsing cannot disagree on day-first
    versus month-first order.
    """
    present = col_data.dropna()
    if pd.api.types.is_datetime64_any_dtype(present):
        return present
    parsed = [_match_datetime_format(value)[1] for value in present]
    as_objects = pd.Series(parsed, index=present.index, name=col_data.name, dtype=object)
    return pd.to_datetime(as_objects)


def read_spreadsheet(file_path):
    """Summarise every column of a spreadsheet as a list of text lines.

    Excel extensions (matched case-insensitively) are read with
    ``read_excel``; any other path is read with ``read_csv``. Each column is
    classified as boolean, datetime, numeric or string, in that order.

    Returns:
        list[str]: one line per statistic, formatted as
        ``"File: <name>, Column: <col>, Type: <dtype>, <stat>: <value>"``.
    """
    extension = os.path.splitext(file_path)[1].lower()
    df = read_excel(file_path) if extension in _EXCEL_EXTENSIONS else read_csv(file_path)

    file_name = os.path.basename(file_path)
    stats = []
    for column in df.columns:
        col_data = df[column]

        if _is_boolean_dtype(col_data):
            col_stats = get_boolean_stats(col_data)
        elif _is_datetime_dtype(col_data):
            # Rebind so the reported dtype is the parsed datetime dtype.
            col_data = _to_datetime_series(col_data)
            col_stats = get_datetime_stats(col_data)
        elif pd.api.types.is_numeric_dtype(col_data):
            col_stats = get_numeric_stats(col_data)
        else:
            # Default to string statistics when nothing above matched.
            col_stats = get_string_stats(col_data)

        col_type = col_data.dtype
        stats.extend(
            f"File: {file_name}, "
            f"Column: {column}, "
            f"Type: {col_type}, "
            f"{stat_name}: {stat_value}"
            for stat_name, stat_value in col_stats.items()
        )

    return stats


def _is_boolean_dtype(col_data):
    """Return True for bool columns or columns holding only 'true'/'false' text.

    Missing values are ignored; a column with no values left is not boolean.
    """
    if col_data.dtype.name == 'bool':
        return True
    present = col_data.dropna()
    if present.empty:
        return False
    return all(
        isinstance(value, str) and value.lower() in _BOOLEAN_LITERALS
        for value in present
    )


def _is_datetime_dtype(col_data):
    """Return True for datetime64 columns or text columns of parseable dates.

    Missing values are ignored; a column with no values left is not datetime.
    """
    # Columns pandas already parsed as datetimes have no `.str` accessor, so
    # they must not fall through to the string statistics.
    if pd.api.types.is_datetime64_any_dtype(col_data):
        return True
    present = col_data.dropna()
    if present.empty:
        return False
    for value in present:
        try:
            infer_datetime_format(value)
        except ValueError:
            return False
    return True