-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFillDoc.py
More file actions
91 lines (73 loc) · 3.6 KB
/
FillDoc.py
File metadata and controls
91 lines (73 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import docx
import fitz # PyMuPDF
import pytesseract
import pandas as pd
import streamlit as st
from PIL import Image
# Make sure the output directory for processed files exists.
SAVE_DIR = "processed_files"
os.makedirs(SAVE_DIR, exist_ok=True)

# Location of the Tesseract binary for a default Windows install; adjust it to
# match your installation.
_WINDOWS_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Only override pytesseract's default command when that binary really exists.
# Assigning a non-existent path unconditionally would break OCR on every other
# platform and on Windows installs located elsewhere.
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD

st.title("📜 LexExtract - Extraction et Remplissage de Contrats")
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that yields the
            PDF bytes.

    Returns:
        The plain text of all pages, joined with newlines.
    """
    # Open the document as a context manager so it is closed (and its buffers
    # released) even if text extraction raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)
def extract_text_from_image(image_file):
    """Run Tesseract OCR on an image (JPG, PNG) and return the recognised text."""
    # "eng+fra" asks Tesseract for both English and French.
    return pytesseract.image_to_string(Image.open(image_file), lang="eng+fra")
def extract_text_from_txt(txt_file):
    """Extract the text of a plain-text file.

    Args:
        txt_file: Binary file-like object exposing ``read()`` that returns
            ``bytes``.

    Returns:
        The decoded text. UTF-8 is tried first (a leading byte-order mark is
        dropped); content that is not valid UTF-8 is decoded as Windows-1252
        instead of raising ``UnicodeDecodeError``.
    """
    raw = txt_file.read()
    try:
        # "utf-8-sig" is UTF-8 that also strips a leading BOM, so the BOM does
        # not end up as a stray character at the start of the text.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Legacy Windows text files are commonly Windows-1252; the few bytes
        # that encoding leaves undefined are replaced rather than crashing.
        return raw.decode("cp1252", errors="replace")
def extract_text_from_docx(docx_file):
    """Return the text of a DOCX file, with its paragraphs joined by newlines."""
    document = docx.Document(docx_file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def save_to_docx(text, filename):
    """Save the extracted text as a one-paragraph DOCX file inside SAVE_DIR.

    Args:
        text: Text to store in the document.
        filename: Base name of the file; ``.docx`` is appended to it.

    Returns:
        The path of the written file.
    """
    target = os.path.join(SAVE_DIR, f"{filename}.docx")
    document = docx.Document()
    document.add_paragraph(text)
    document.save(target)
    return target
def fill_contract_template(text, template_path, output_path, replacements=None):
    """Fill the placeholders of a DOCX template and save the result.

    Args:
        text: Extracted source text. Currently unused: the inserted values
            come from ``replacements``, not from this text.
        template_path: Path of the DOCX template to read.
        output_path: Path the filled document is written to.
        replacements: Optional mapping of placeholder to replacement value.
            Defaults to the example values ``"[Date]" -> "01/01/2025"`` and
            ``"[Nom]" -> "Entreprise XYZ"``.
    """
    if replacements is None:
        replacements = {"[Date]": "01/01/2025", "[Nom]": "Entreprise XYZ"}

    doc = docx.Document(template_path)
    for para in doc.paragraphs:
        for placeholder, value in replacements.items():
            if placeholder not in para.text:
                continue
            expected = para.text.replace(placeholder, value)
            # Replace inside individual runs first: assigning para.text
            # collapses the paragraph into a single run and drops the
            # character formatting (bold, italics, fonts) of the template.
            for run in para.runs:
                if placeholder in run.text:
                    run.text = run.text.replace(placeholder, value)
            # Fall back to a paragraph-level replacement when the run-level
            # pass did not produce the expected text, e.g. because the
            # placeholder is split across several runs.
            if para.text != expected:
                para.text = expected
    doc.save(output_path)
# Streamlit interface: upload a file, show its extracted text, then offer the
# filled contract for download.
uploaded_file = st.file_uploader("📂 Upload un fichier", type=["txt", "pdf", "docx", "jpg", "jpeg", "png"])
template_path = "template.docx"  # Path of the contract template
output_path = os.path.join(SAVE_DIR, "contrat_rempli.docx")

if uploaded_file is not None:
    # Choose the extractor from the (lower-cased) file extension.
    file_extension = uploaded_file.name.split(".")[-1].lower()
    extracted_text = ""

    if file_extension == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_extension in ["jpg", "jpeg", "png"]:
        extracted_text = extract_text_from_image(uploaded_file)
    elif file_extension == "txt":
        extracted_text = extract_text_from_txt(uploaded_file)
    elif file_extension == "docx":
        extracted_text = extract_text_from_docx(uploaded_file)

    st.text_area("📖 Texte extrait", extracted_text, height=300)

    if not os.path.isfile(template_path):
        # Report a missing template in the page instead of letting the DOCX
        # loader fail with an unhandled exception.
        st.error(f"❌ Modèle de contrat introuvable : {template_path}")
    else:
        fill_contract_template(extracted_text, template_path, output_path)
        st.success(f"✅ Contrat rempli sauvegardé : {output_path}")

        with open(output_path, "rb") as f:
            st.download_button("📥 Télécharger le contrat rempli", f, file_name="contrat_rempli.docx")
# Sidebar: list the most recently processed files.
st.sidebar.title("📂 Fichiers traités récemment")
# Keep regular files only: a sub-directory inside SAVE_DIR cannot be opened
# for download and would make open() fail below.
processed_files = sorted(
    (name for name in os.listdir(SAVE_DIR) if os.path.isfile(os.path.join(SAVE_DIR, name))),
    key=lambda name: os.path.getmtime(os.path.join(SAVE_DIR, name)),
    reverse=True,
)
for file in processed_files[:5]:  # Show the 5 most recent files
    file_path = os.path.join(SAVE_DIR, file)
    with open(file_path, "rb") as f:
        st.sidebar.download_button(f"📄 {file}", f, file_name=file)