From 0b8d0739c2628775c63e0f3d1a5a3fdf57d7b83e Mon Sep 17 00:00:00 2001 From: tomseimandi Date: Fri, 12 Apr 2024 07:57:27 -0400 Subject: [PATCH 1/2] Add custom INPI authentication --- app/pages/Nouvelle_extraction.py | 367 ++++++++++++++++--------------- app/streamlit_utils.py | 20 ++ 2 files changed, 205 insertions(+), 182 deletions(-) diff --git a/app/pages/Nouvelle_extraction.py b/app/pages/Nouvelle_extraction.py index fb368d7..994b10e 100644 --- a/app/pages/Nouvelle_extraction.py +++ b/app/pages/Nouvelle_extraction.py @@ -38,211 +38,214 @@ # Initialize cached resources fs = get_file_system() -# Document querier - requires user name and password -document_querier = DocumentQuerier( - os.environ["TEST_INPI_USERNAME"], os.environ["TEST_INPI_PASSWORD"] -) -# Allow users to input year +# Input year year = st.text_area( - label="Entrez l'année pour laquelle vous souhaitez vérifier " - "la disponibilité du document", + label="Entrez l'année du document souhaité.", value="2021", max_chars=4, ) -# Allow users to input multiple document IDs -company_ids = st.text_area("Entrez les numéros Siren (séparés d'un espace):") -# Split the user input into a list of document IDs -company_ids = company_ids.split() +# Input Siren +company_id = st.text_area("Entrez un numéro Siren:") # Add a button to check availability for all specified documents -dispo_button = st.button("Vérifier la disponibilité") -if not st.session_state.get("button"): - st.session_state["button"] = dispo_button - -if st.session_state["button"]: - try: - year = int(year) - except ValueError: - st.error("Année non valide.") - - if isinstance(year, int): - for company_id in company_ids: - if not check_siren_length(company_id): - st.error( - f"Le numéro Siren {company_id} ne contient " f"pas 9 caractères." - ) - else: - availability, document_id = check_availability( - document_querier, company_id, year +if not st.session_state.inpi_auth: + st.write( + "Vous n'êtes pas authentifié auprès le l'INPI. 
" + "Renseigner des identifiants valides à gauche." + ) +else: + document_querier = DocumentQuerier( + username=st.session_state.inpi_credentials.get("username"), + password=st.session_state.inpi_credentials.get("password"), + ) + + # Button to check availability + dispo_button = st.button("Vérifier la disponibilité") + if not st.session_state.get("button"): + st.session_state["button"] = dispo_button + + if st.session_state["button"]: + try: + year = int(year) + except ValueError: + st.error("Année non valide.") + + if not check_siren_length(company_id): + st.error( + f"Le numéro Siren {company_id} ne contient pas 9 caractères." + ) + elif isinstance(year, int): + availability, document_id = check_availability( + document_querier, company_id, year + ) + + if availability: + file_name = f"CA_{company_id}_{year}.pdf" + # Display the availability status for each document + st.write(f"Document disponible pour le Siren {company_id}.") + + PDFbyte = download_pdf(document_querier, document_id) + st.download_button( + label="Comptes annuels", + data=PDFbyte, + file_name=file_name, + mime="application/octet-stream", ) - if availability: - file_name = f"CA_{company_id}_{year}.pdf" - # Display the availability status for each document - st.write(f"Document disponible pour le " f"Siren {company_id}.") - - PDFbyte = download_pdf(document_querier, document_id) - st.download_button( - label="Comptes annuels", - data=PDFbyte, - file_name=file_name, - mime="application/octet-stream", + selection_button = st.button( + "Identification de la page d'intérêt", + key=f"page_selection_btn_{company_id}_{year}", + ) + if not st.session_state.get( + f"selection_button_{company_id}_{year}" + ): + st.session_state[f"selection_button_{company_id}_{year}"] = ( + selection_button ) + if st.session_state[f"selection_button_{company_id}_{year}"]: + try: + s3_path = os.path.join( + PDF_SAMPLES_PATH, f"{company_id}_{year}.pdf" + ) + # Check if selected page file is already persisted + if fs.exists(s3_path): + document = 
read_pdf_from_s3(fs, s3_path) + # Else run page selection and persist the selected page + else: + document = fitz.open(stream=PDFbyte, filetype="pdf") + # TODO: There can be multiple pages sometimes + # TODO: implement this possibility + page_selection_url = ( + "https://extraction-cs.lab.sspcloud.fr/select_page" + ) + files = {"pdf_file": document.tobytes()} + response = requests.post( + url=page_selection_url, files=files + ) + # TODO: handle errors using result field + page_number = response.json()["page_number"] + st.write( + f"Un tableau filiales et participations a été " + f"repéré à la page {page_number + 1}." + ) + document.select([page_number]) + # Save to persistent storage + upload_pdf_to_s3( + document=document, fs=fs, s3_path=s3_path + ) - selection_button = st.button( - "Identification de la page d'intérêt", - key=f"page_selection_btn_{company_id}_{year}", - ) - if not st.session_state.get( - f"selection_button_{company_id}_{year}" - ): - st.session_state[f"selection_button_{company_id}_{year}"] = ( - selection_button + table_transformer_tab, extract_table_tab = st.tabs( + ["Table transformer", "Site ExtractTable"] ) - if st.session_state[f"selection_button_{company_id}_{year}"]: - try: - s3_path = os.path.join( - PDF_SAMPLES_PATH, f"{company_id}_{year}.pdf" + + # Extraction + with table_transformer_tab: + extraction_button = st.button( + "Extraction des tableaux", + key=f"extraction_btn_{company_id}_{year}", ) - # Check if selected page file is already persisted - if fs.exists(s3_path): - document = read_pdf_from_s3(fs, s3_path) - # Else run page selection and persist the selected page - else: - document = fitz.open(stream=PDFbyte, filetype="pdf") - # TODO: There can be multiple pages sometimes - # TODO: implement this possibility - page_selection_url = ( - "https://extraction-cs.lab.sspcloud.fr/select_page" - ) - files = {"pdf_file": document.tobytes()} - response = requests.post( - url=page_selection_url, files=files - ) - # TODO: handle errors 
using result field - page_number = response.json()["page_number"] - st.write( - f"Un tableau filiales et participations a été " - f"repéré à la page {page_number + 1}." - ) - document.select([page_number]) - # Save to persistent storage - upload_pdf_to_s3( - document=document, fs=fs, s3_path=s3_path + text_placeholder = st.empty() + if not st.session_state.get( + f"extraction_btn_{company_id}_{year}_state" + ): + st.session_state[ + f"extraction_btn_{company_id}_{year}_state" + ] = extraction_button + if st.session_state[ + f"extraction_btn_{company_id}_{year}_state" + ]: + extraction_s3_path = os.path.join( + TABLE_TRANSFORMER_EXTRACTIONS_PATH, + f"{company_id}_{year}", ) + if fs.exists(extraction_s3_path): + text_placeholder.write( + "L'extraction existe déjà: " + "accédez-y grâce à l'onglet 'Extractions disponibles'." + ) + else: + text_placeholder.write("Extraction en cours...") + # Table extraction + table_transformer_output = ( + extract_tables_transformer(document) + ) + for table_idx, df in enumerate( + table_transformer_output + ): + # Save to persistent storage + with fs.open( + os.path.join( + extraction_s3_path, + f"table_{table_idx}.csv", + ), + "wb", + ) as f: + df.to_csv(f) + text_placeholder.write( + f"Extraction de {len(table_transformer_output)} effectuée: " + f"accédez-y grâce à l'onglet 'Extractions disponibles'." 
+ ) - table_transformer_tab, extract_table_tab = st.tabs( - ["Table transformer", "Site ExtractTable"] + with extract_table_tab: + # ExtractTable extraction + extract_table_button = st.button( + "Extraction des tableaux", + key=f"extract_table_btn_{company_id}_{year}", ) - - # Extraction - with table_transformer_tab: - extraction_button = st.button( - "Extraction des tableaux", - key=f"extraction_btn_{company_id}_{year}", + text_placeholder = st.empty() + if not st.session_state.get( + f"extract_table_btn_{company_id}_{year}_state" + ): + st.session_state[ + f"extract_table_btn_{company_id}_{year}_state" + ] = extract_table_button + if st.session_state[ + f"extract_table_btn_{company_id}_{year}_state" + ]: + extract_table_s3_path = os.path.join( + EXTRACT_TABLE_EXTRACTIONS_PATH, + f"{company_id}_{year}", ) - text_placeholder = st.empty() - if not st.session_state.get( - f"extraction_btn_{company_id}_{year}_state" - ): - st.session_state[ - f"extraction_btn_{company_id}_{year}_state" - ] = extraction_button - if st.session_state[ - f"extraction_btn_{company_id}_{year}_state" - ]: - extraction_s3_path = os.path.join( - TABLE_TRANSFORMER_EXTRACTIONS_PATH, - f"{company_id}_{year}", - ) - if fs.exists(extraction_s3_path): - text_placeholder.write( - "L'extraction existe déjà: " - "accédez-y grâce à l'onglet 'Extractions disponibles'." - ) - else: - text_placeholder.write("Extraction en cours...") - # Table extraction - table_transformer_output = ( - extract_tables_transformer(document) - ) - for table_idx, df in enumerate( - table_transformer_output - ): - # Save to persistent storage - with fs.open( - os.path.join( - extraction_s3_path, - f"table_{table_idx}.csv", - ), - "wb", - ) as f: - df.to_csv(f) - text_placeholder.write( - f"Extraction de {len(table_transformer_output)} effectuée: " - f"accédez-y grâce à l'onglet 'Extractions disponibles'." 
- ) - - with extract_table_tab: - # ExtractTable extraction - extract_table_button = st.button( - "Extraction des tableaux", - key=f"extract_table_btn_{company_id}_{year}", + extract_table_confidence_s3_path = os.path.join( + EXTRACT_TABLE_CONFIDENCES_PATH, + f"{company_id}_{year}", ) - text_placeholder = st.empty() - if not st.session_state.get( - f"extract_table_btn_{company_id}_{year}_state" - ): - st.session_state[ - f"extract_table_btn_{company_id}_{year}_state" - ] = extract_table_button - if st.session_state[ - f"extract_table_btn_{company_id}_{year}_state" - ]: - extract_table_s3_path = os.path.join( - EXTRACT_TABLE_EXTRACTIONS_PATH, - f"{company_id}_{year}", - ) - extract_table_confidence_s3_path = os.path.join( - EXTRACT_TABLE_CONFIDENCES_PATH, - f"{company_id}_{year}", + if fs.exists(extract_table_s3_path): + text_placeholder.write( + "L'extraction existe déjà: " + "accédez-y grâce à l'onglet 'Extractions disponibles'." ) - if fs.exists(extract_table_s3_path): - text_placeholder.write( - "L'extraction existe déjà: " - "accédez-y grâce à l'onglet 'Extractions disponibles'." 
- ) - else: - text_placeholder.write("Extraction en cours...") - outputs = extract_tables(document) - for table_idx, (df, df_conf) in enumerate( - outputs - ): - # Save as excel file + else: + text_placeholder.write("Extraction en cours...") + outputs = extract_tables(document) + for table_idx, (df, df_conf) in enumerate( + outputs + ): + # Save as excel file + with fs.open( + os.path.join( + extract_table_s3_path, + f"table_{table_idx}.xlsx", + ), + "wb", + ) as f: + df.to_excel(f) + # Save confidences + if df_conf is not None: with fs.open( os.path.join( - extract_table_s3_path, + extract_table_confidence_s3_path, f"table_{table_idx}.xlsx", ), "wb", ) as f: - df.to_excel(f) - # Save confidences - if df_conf is not None: - with fs.open( - os.path.join( - extract_table_confidence_s3_path, - f"table_{table_idx}.xlsx", - ), - "wb", - ) as f: - df_conf.to_excel(f) - text_placeholder.write( - f"Extraction de {len(outputs)} tableaux effectuée: " - f"accédez-y grâce à l'onglet 'Extractions disponibles'." - ) - except ValueError as e: - # Print error message. - st.write(str(e)) + df_conf.to_excel(f) + text_placeholder.write( + f"Extraction de {len(outputs)} tableaux effectuée: " + f"accédez-y grâce à l'onglet 'Extractions disponibles'." + ) + except ValueError as e: + # Print error message. + st.write(str(e)) diff --git a/app/streamlit_utils.py b/app/streamlit_utils.py index ee4ef88..80cab68 100644 --- a/app/streamlit_utils.py +++ b/app/streamlit_utils.py @@ -36,6 +36,7 @@ def sidebar_content(): """ Add side bar content for ExtractTable authentication. 
""" + # Add ExtractTable token input if "auth_token" not in st.session_state: st.session_state.auth_token = None token = st.sidebar.text_input("ExtractTable token", type="password", key="token") @@ -44,3 +45,22 @@ def sidebar_content(): st.session_state.auth_token = token remaining_credits = get_extract_table_credits(token) st.sidebar.write(f"Crédits restants: {remaining_credits}") + + # Add INPI credentials input + if 'inpi_auth' not in st.session_state: + st.session_state.inpi_auth = False + if "inpi_credentials" not in st.session_state: + st.session_state.inpi_credentials = {} + inpi_username = st.sidebar.text_input("Nom d'utilisateur INPI", key="inpi_username") + inpi_password = st.sidebar.text_input("Mot de passe INPI", type="password", key="inpi_password") + if st.sidebar.button("Authentification INPI"): + if inpi_username and inpi_password: + st.session_state.inpi_credentials = { + "username": inpi_username, + "password": inpi_password, + } + st.sidebar.write("Credentials INPI renseignés.") + # TODO: implement test to check credentials work + # TODO: if test passes, modify session state + if True: + st.session_state.inpi_auth = True From f4e039524a990cae98806c5f0415f62f6eecb58b Mon Sep 17 00:00:00 2001 From: tomseimandi Date: Fri, 12 Apr 2024 07:58:14 -0400 Subject: [PATCH 2/2] Edit README --- README.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/README.md b/README.md index a963ca4..e27b79d 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,7 @@ ## Mise en route -Avant de lancer l'application, installer les dépendances avec `pip install -r requirements.txt`, puis renseigner les variables d'environnement: - -- `TEST_INPI_USERNAME`: nom d'utilisation du compte INPI; -- `TEST_INPI_PASSWORD`: mot de passe du compte INPI; - -Puis lancer l'application avec `streamlit run main.py --server.port=8501 --server.address=0.0.0.0` par exemple. 
+Avant de lancer l'application, installer les dépendances avec `pip install -r requirements.txt`, puis lancer l'application avec `streamlit run main.py --server.port=8501 --server.address=0.0.0.0` par exemple. ## Briques