Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 159 additions & 5 deletions src/file_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,45 @@
from PIL import Image
import pytesseract
from bs4 import BeautifulSoup
from datetime import datetime

# pytesseract must be told where the Tesseract executable lives when it is not
# reachable through PATH (the usual situation on Windows).
# Installation instructions: https://github.com/UB-Mannheim/tesseract/wiki
_WINDOWS_TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Only override pytesseract's lookup when that install actually exists.
# Forcing a Windows path unconditionally would break OCR on machines where
# `tesseract` is already resolvable through PATH (Linux, macOS, CI images).
if os.path.isfile(_WINDOWS_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_EXE

def infer_datetime_format(date_str):
    """Return the first known ``strptime`` format that parses ``date_str``.

    Candidate formats are tried in a fixed order, so an ambiguous value such
    as ``'03-04-2023'`` resolves to the day-first format because it is listed
    before the month-first one.

    Args:
        date_str: The text to classify.

    Returns:
        The matching format string, e.g. ``'%Y-%m-%d %H:%M'``.

    Raises:
        ValueError: If ``date_str`` is not a string or matches no candidate.
    """
    # Spreadsheet cells may be NaN/None/numbers.  strptime would raise
    # TypeError for those, which callers that only expect ValueError
    # would not catch.
    if not isinstance(date_str, str):
        raise ValueError("Could not infer datetime format from input string")

    possible_formats = (
        '%Y-%m-%d %H:%M:%S',     # '2023-02-28 14:30:00'
        '%Y-%m-%d %H:%M',        # '2023-02-28 14:30'
        '%Y-%m-%d %H:%M:%S.%f',  # '2023-02-28 14:30:00.000000'
        '%Y-%m-%d',              # '2023-02-28'
        '%d-%m-%Y %H:%M:%S',     # day first: '28-02-2023 14:30:00'
        '%d-%m-%Y %H:%M',        # day first: '28-02-2023 14:30'
        '%d-%m-%Y',              # day first: '28-02-2023'
        '%m-%d-%Y %H:%M:%S',     # month first: '02-28-2023 14:30:00'
        '%m-%d-%Y %H:%M',        # month first: '02-28-2023 14:30'
        '%m-%d-%Y',              # month first: '02-28-2023'
        '%Y/%m/%d %H:%M:%S',     # '2023/02/28 14:30:00'
        '%Y/%m/%d %H:%M',        # '2023/02/28 14:30'
        '%Y/%m/%d %H:%M:%S.%f',  # '2023/02/28 14:30:00.000000'
        '%Y/%m/%d',              # '2023/02/28'
        '%d/%m/%Y %H:%M:%S',     # day first: '28/02/2023 14:30:00'
        '%d/%m/%Y %H:%M',        # day first: '28/02/2023 14:30'
        '%d/%m/%Y',              # day first: '28/02/2023'
        '%m/%d/%Y %H:%M:%S',     # month first: '02/28/2023 14:30:00'
        '%m/%d/%Y %H:%M',        # month first: '02/28/2023 14:30'
        '%m/%d/%Y',              # month first: '02/28/2023'
    )

    for date_format in possible_formats:
        try:
            datetime.strptime(date_str, date_format)
        except ValueError:
            continue
        return date_format

    # If no format matches
    raise ValueError("Could not infer datetime format from input string")

def read_file(file_path):
extension = os.path.splitext(file_path)[1].lower()
Expand Down Expand Up @@ -39,6 +74,7 @@ def read_text_file(file_path):
print(para + "\n")

return text

def read_pdf_file(file_path):
text = []
with open(file_path, 'rb') as file:
Expand All @@ -59,13 +95,132 @@ def read_docx_file(file_path):
text.append(para.text)
return text

def read_excel(file_path):
    """Load the Excel workbook at ``file_path`` via ``pandas.read_excel``.

    Returns whatever ``pandas.read_excel`` yields with its default options.
    """
    frame = pd.read_excel(file_path)
    return frame

def read_csv(file_path):
    """Load the CSV file at ``file_path`` via ``pandas.read_csv``.

    Returns whatever ``pandas.read_csv`` yields with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

# Functions to compute statistics for different datatypes
def get_numeric_stats(col_data):
    """Summarise a numeric pandas Series.

    Args:
        col_data: The numeric column to describe.

    Returns:
        A dict mapping statistic name to value, in this order: mean, median,
        std, min, max, 25th_percentile, 75th_percentile, sum, count, variance.
    """
    summary = {}
    summary['mean'] = col_data.mean()
    summary['median'] = col_data.median()
    summary['std'] = col_data.std()
    summary['min'] = col_data.min()
    summary['max'] = col_data.max()
    # Quartiles are reported under the percentile names callers expect.
    summary['25th_percentile'] = col_data.quantile(0.25)
    summary['75th_percentile'] = col_data.quantile(0.75)
    summary['sum'] = col_data.sum()
    summary['count'] = col_data.count()
    summary['variance'] = col_data.var()
    return summary

def get_string_stats(col_data):
    """Summarise a text pandas Series.

    Args:
        col_data: The string column to describe.

    Returns:
        A dict with the mode, number of distinct values, length statistics
        (total/mean/max/min of ``str.len()``), the most and least frequent
        values, the full value -> count mapping and the array of distinct
        values.  ``mode``, ``most_frequent`` and ``least_frequent`` are
        ``None`` when there is nothing to count.
    """
    # Each of these scans the whole column, so compute them once instead of
    # once per dictionary entry.
    lengths = col_data.str.len()
    counts = col_data.value_counts()
    modes = col_data.mode()
    has_counts = not counts.empty

    return {
        'mode': modes.iloc[0] if not modes.empty else None,
        'unique_count': col_data.nunique(),
        'total_length': lengths.sum(),
        'mean_length': lengths.mean(),
        'max_length': lengths.max(),
        'min_length': lengths.min(),
        'most_frequent': counts.idxmax() if has_counts else None,
        'least_frequent': counts.idxmin() if has_counts else None,
        'frequency_distribution': counts.to_dict(),
        'unique_values': col_data.unique()
    }

def get_datetime_stats(col_data):
    """Summarise a datetime pandas Series.

    Args:
        col_data: The datetime column to describe.

    Returns:
        A dict with the earliest and latest values, their difference
        (``range``), the median, and the year/month/day components of the
        earliest (``start_*``) and latest (``end_*``) values.
    """
    # min()/max() each scan the whole column; compute them once and reuse.
    earliest = col_data.min()
    latest = col_data.max()

    return {
        'min_date': earliest,
        'max_date': latest,
        'range': latest - earliest,
        'median_date': col_data.median(),
        'start_year': earliest.year,
        'end_year': latest.year,
        'start_month': earliest.month,
        'end_month': latest.month,
        'start_day': earliest.day,
        'end_day': latest.day
    }

def get_boolean_stats(col_data):
    """Summarise a boolean pandas Series.

    Accepts real boolean/numeric columns as well as text columns whose cells
    spell ``true``/``false`` in any letter case; text is normalised to real
    booleans before anything is counted.

    Args:
        col_data: The boolean (or boolean-like text) column to describe.

    Returns:
        A dict with the true/false counts and percentages, the number of
        distinct values, the mode, the most and least frequent values and the
        value -> count mapping.  ``mode``, ``most_frequent`` and
        ``least_frequent`` are ``None`` when there is nothing to count.
    """
    if pd.api.types.is_numeric_dtype(col_data):
        # bool and 0/1 numeric columns can be summed and averaged directly.
        flags = col_data
    else:
        # Text such as 'True'/'false' cannot be summed or cast with
        # astype(int), so map each cell to a real boolean first.
        flags = col_data.map(
            lambda cell: str(cell).strip().lower() == 'true'
        ).astype(bool)

    # Compute the full-column scans once and reuse them below.
    true_count = flags.sum()
    true_ratio = flags.mean()
    counts = flags.value_counts()
    modes = flags.mode()
    has_counts = not counts.empty

    return {
        'count_true': true_count,
        'count_false': flags.count() - true_count,
        'percent_true': true_ratio * 100,
        'percent_false': (1 - true_ratio) * 100,
        'unique_count': flags.nunique(),
        'mode': modes.iloc[0] if not modes.empty else None,
        'most_frequent': counts.idxmax() if has_counts else None,
        'least_frequent': counts.idxmin() if has_counts else None,
        'frequency_distribution': counts.to_dict()
    }

def read_spreadsheet(file_path):
    """Read a spreadsheet and describe every column as a list of text lines.

    Paths ending in ``.xlsx`` (any letter case) are read with ``read_excel``;
    everything else is read with ``read_csv``.  Each column is classified as
    boolean, datetime, numeric or (by default) string, and the matching
    ``get_*_stats`` helper supplies its statistics.

    Args:
        file_path: Path of the ``.xlsx`` or CSV file to summarise.

    Returns:
        A list of strings, one per (column, statistic) pair, formatted as
        ``"File: <name>, Column: <column>, Type: <dtype>, <stat>: <value>"``.
    """
    # Compare case-insensitively so 'REPORT.XLSX' is not parsed as CSV.
    if file_path.lower().endswith('.xlsx'):
        df = read_excel(file_path)
    else:
        df = read_csv(file_path)

    file_name = os.path.basename(file_path)
    stats = []
    for column in df.columns:
        col_data = df[column]

        # Boolean columns, including text cells spelling true/false.
        if _is_boolean_dtype(col_data):
            col_stats = get_boolean_stats(col_data)

        # Datetime columns: either already datetime64 (typical for Excel
        # date cells) or text in which every cell matches a known format.
        elif (pd.api.types.is_datetime64_any_dtype(col_data)
              or _is_datetime_dtype(col_data)):
            # The deprecated infer_datetime_format flag is intentionally not
            # passed.  Cells that fail to parse become NaT and are dropped
            # so they do not distort the statistics.
            col_data = pd.to_datetime(col_data, errors='coerce').dropna()
            col_stats = get_datetime_stats(col_data)

        elif pd.api.types.is_numeric_dtype(col_data):
            col_stats = get_numeric_stats(col_data)

        # Default to string statistics if none of the above matched.
        else:
            col_stats = get_string_stats(col_data)

        # Read the dtype after any conversion above so that parsed datetime
        # columns report their datetime dtype rather than object.
        col_type = col_data.dtype

        for stat_name, stat_value in col_stats.items():
            stat_str = (f"File: {file_name}, "
                        f"Column: {column}, "
                        f"Type: {col_type}, "
                        f"{stat_name}: {stat_value}")
            stats.append(stat_str)

    return stats

def _is_boolean_dtype(col_data):
if col_data.dtype.name == 'bool':
return True
elif pd.api.types.is_string_dtype(col_data):
# Check if all string values can be interpreted as boolean
return col_data.apply(lambda x: x.lower() in ['true', 'false']).all()
else:
return False

def _is_datetime_dtype(col_data):
if pd.api.types.is_string_dtype(col_data):
# Attempt to parse strings as datetime, handling multiple formats
for date_str in col_data:
try:
infer_datetime_format(date_str)
except ValueError:
return False
return True
else:
return False

def read_image_file(file_path):
image = Image.open(file_path)
Expand All @@ -89,4 +244,3 @@ def read_code_file(file_path):
print(block)

return blocks

4 changes: 3 additions & 1 deletion src/prompt_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ def init_model(model_name):

def generate_prompt(context):
prompt = (
"You are a highly accurate text extraction system which will extract questions and answers from completely unstructured data. Your task is to extract the user prompts, and corresponding responses from the given context. Ensure the extracted data follows the exact format given below:\n\n"
"You are a highly accurate text extraction system which will extract questions and answers from completely unstructured data. Your task is to extract user prompts, and corresponding responses from the given unstructured context.\n"
"When processing statistical values from spreadsheets, ensure that you include all details about the data provided to you in a single question (ex: What is the datatype of the integers column in test_data.csv: int64, or 'what is the mean of the prices column in shopping.csv: 60.54).\n"
"Ensure the extracted data follows the exact format given below:\n\n"
"Format:\n"
"User Prompt: [question]\n"
"Response: [response]\n\n"
Expand Down