From 0b8d0739c2628775c63e0f3d1a5a3fdf57d7b83e Mon Sep 17 00:00:00 2001
From: tomseimandi <tom.seimandi@gmail.com>
Date: Fri, 12 Apr 2024 07:57:27 -0400
Subject: [PATCH 1/2] Add custom INPI authentication

---
 app/pages/Nouvelle_extraction.py | 367 ++++++++++++++++---------------
 app/streamlit_utils.py           |  20 ++
 2 files changed, 205 insertions(+), 182 deletions(-)

diff --git a/app/pages/Nouvelle_extraction.py b/app/pages/Nouvelle_extraction.py
index fb368d7..994b10e 100644
--- a/app/pages/Nouvelle_extraction.py
+++ b/app/pages/Nouvelle_extraction.py
@@ -38,211 +38,214 @@
 
 # Initialize cached resources
 fs = get_file_system()
-# Document querier - requires user name and password
-document_querier = DocumentQuerier(
-    os.environ["TEST_INPI_USERNAME"], os.environ["TEST_INPI_PASSWORD"]
-)
 
-# Allow users to input year
+# Input year
 year = st.text_area(
-    label="Entrez l'année pour laquelle vous souhaitez vérifier "
-    "la disponibilité du document",
+    label="Entrez l'année du document souhaité.",
     value="2021",
     max_chars=4,
 )
 
-# Allow users to input multiple document IDs
-company_ids = st.text_area("Entrez les numéros Siren (séparés d'un espace):")
-# Split the user input into a list of document IDs
-company_ids = company_ids.split()
+# Input Siren
+company_id = st.text_area("Entrez un numéro Siren:")
 
 # Add a button to check availability for all specified documents
-dispo_button = st.button("Vérifier la disponibilité")
-if not st.session_state.get("button"):
-    st.session_state["button"] = dispo_button
-
-if st.session_state["button"]:
-    try:
-        year = int(year)
-    except ValueError:
-        st.error("Année non valide.")
-
-    if isinstance(year, int):
-        for company_id in company_ids:
-            if not check_siren_length(company_id):
-                st.error(
-                    f"Le numéro Siren {company_id} ne contient " f"pas 9 caractères."
-                )
-            else:
-                availability, document_id = check_availability(
-                    document_querier, company_id, year
+if not st.session_state.get("inpi_auth"):  # missing key == not authenticated
+    st.write(
+        "Vous n'êtes pas authentifié auprès de l'INPI. "
+        "Renseigner des identifiants valides à gauche."
+    )
+else:
+    document_querier = DocumentQuerier(
+        username=st.session_state.inpi_credentials.get("username"),
+        password=st.session_state.inpi_credentials.get("password"),
+    )
+
+    # Button to check availability
+    dispo_button = st.button("Vérifier la disponibilité")
+    if not st.session_state.get("button"):
+        st.session_state["button"] = dispo_button
+
+    if st.session_state["button"]:
+        try:
+            year = int(year)
+        except ValueError:
+            st.error("Année non valide.")
+            st.stop()  # invalid year: end this run instead of querying with it
+        if not check_siren_length(company_id):
+            st.error(
+                f"Le numéro Siren {company_id} ne contient pas 9 caractères."
+            )
+        else:
+            availability, document_id = check_availability(
+                document_querier, company_id, year
+            )
+
+            if availability:
+                file_name = f"CA_{company_id}_{year}.pdf"
+                # Display the availability status for the document
+                st.write(f"Document disponible pour le Siren {company_id}.")
+
+                PDFbyte = download_pdf(document_querier, document_id)
+                st.download_button(
+                    label="Comptes annuels",
+                    data=PDFbyte,
+                    file_name=file_name,
+                    mime="application/octet-stream",
                 )
 
-                if availability:
-                    file_name = f"CA_{company_id}_{year}.pdf"
-                    # Display the availability status for each document
-                    st.write(f"Document disponible pour le " f"Siren {company_id}.")
-
-                    PDFbyte = download_pdf(document_querier, document_id)
-                    st.download_button(
-                        label="Comptes annuels",
-                        data=PDFbyte,
-                        file_name=file_name,
-                        mime="application/octet-stream",
+                selection_button = st.button(
+                    "Identification de la page d'intérêt",
+                    key=f"page_selection_btn_{company_id}_{year}",
+                )
+                if not st.session_state.get(
+                    f"selection_button_{company_id}_{year}"
+                ):
+                    st.session_state[f"selection_button_{company_id}_{year}"] = (
+                        selection_button
                     )
+                if st.session_state[f"selection_button_{company_id}_{year}"]:
+                    try:
+                        s3_path = os.path.join(
+                            PDF_SAMPLES_PATH, f"{company_id}_{year}.pdf"
+                        )
+                        # Check if selected page file is already persisted
+                        if fs.exists(s3_path):
+                            document = read_pdf_from_s3(fs, s3_path)
+                        # Else run page selection and persist the selected page
+                        else:
+                            document = fitz.open(stream=PDFbyte, filetype="pdf")
+                            # TODO: There can be multiple pages sometimes
+                            # TODO: implement this possibility
+                            page_selection_url = (
+                                "https://extraction-cs.lab.sspcloud.fr/select_page"
+                            )
+                            files = {"pdf_file": document.tobytes()}
+                            response = requests.post(
+                                url=page_selection_url, files=files
+                            )
+                            # TODO: handle errors using result field
+                            page_number = response.json()["page_number"]
+                            st.write(
+                                f"Un tableau filiales et participations a été "
+                                f"repéré à la page {page_number + 1}."
+                            )
+                            document.select([page_number])
+                            # Save to persistent storage
+                            upload_pdf_to_s3(
+                                document=document, fs=fs, s3_path=s3_path
+                            )
 
-                    selection_button = st.button(
-                        "Identification de la page d'intérêt",
-                        key=f"page_selection_btn_{company_id}_{year}",
-                    )
-                    if not st.session_state.get(
-                        f"selection_button_{company_id}_{year}"
-                    ):
-                        st.session_state[f"selection_button_{company_id}_{year}"] = (
-                            selection_button
+                        table_transformer_tab, extract_table_tab = st.tabs(
+                            ["Table transformer", "Site ExtractTable"]
                         )
-                    if st.session_state[f"selection_button_{company_id}_{year}"]:
-                        try:
-                            s3_path = os.path.join(
-                                PDF_SAMPLES_PATH, f"{company_id}_{year}.pdf"
+
+                        # Extraction
+                        with table_transformer_tab:
+                            extraction_button = st.button(
+                                "Extraction des tableaux",
+                                key=f"extraction_btn_{company_id}_{year}",
                             )
-                            # Check if selected page file is already persisted
-                            if fs.exists(s3_path):
-                                document = read_pdf_from_s3(fs, s3_path)
-                            # Else run page selection and persist the selected page
-                            else:
-                                document = fitz.open(stream=PDFbyte, filetype="pdf")
-                                # TODO: There can be multiple pages sometimes
-                                # TODO: implement this possibility
-                                page_selection_url = (
-                                    "https://extraction-cs.lab.sspcloud.fr/select_page"
-                                )
-                                files = {"pdf_file": document.tobytes()}
-                                response = requests.post(
-                                    url=page_selection_url, files=files
-                                )
-                                # TODO: handle errors using result field
-                                page_number = response.json()["page_number"]
-                                st.write(
-                                    f"Un tableau filiales et participations a été "
-                                    f"repéré à la page {page_number + 1}."
-                                )
-                                document.select([page_number])
-                                # Save to persistent storage
-                                upload_pdf_to_s3(
-                                    document=document, fs=fs, s3_path=s3_path
+                            text_placeholder = st.empty()
+                            if not st.session_state.get(
+                                f"extraction_btn_{company_id}_{year}_state"
+                            ):
+                                st.session_state[
+                                    f"extraction_btn_{company_id}_{year}_state"
+                                ] = extraction_button
+                            if st.session_state[
+                                f"extraction_btn_{company_id}_{year}_state"
+                            ]:
+                                extraction_s3_path = os.path.join(
+                                    TABLE_TRANSFORMER_EXTRACTIONS_PATH,
+                                    f"{company_id}_{year}",
                                 )
+                                if fs.exists(extraction_s3_path):
+                                    text_placeholder.write(
+                                        "L'extraction existe déjà: "
+                                        "accédez-y grâce à l'onglet 'Extractions disponibles'."
+                                    )
+                                else:
+                                    text_placeholder.write("Extraction en cours...")
+                                    # Table extraction
+                                    table_transformer_output = (
+                                        extract_tables_transformer(document)
+                                    )
+                                    for table_idx, df in enumerate(
+                                        table_transformer_output
+                                    ):
+                                        # Save to persistent storage
+                                        with fs.open(
+                                            os.path.join(
+                                                extraction_s3_path,
+                                                f"table_{table_idx}.csv",
+                                            ),
+                                            "wb",
+                                        ) as f:
+                                            df.to_csv(f)
+                                    text_placeholder.write(
+                                        f"Extraction de {len(table_transformer_output)} effectuée: "
+                                        f"accédez-y grâce à l'onglet 'Extractions disponibles'."
+                                    )
 
-                            table_transformer_tab, extract_table_tab = st.tabs(
-                                ["Table transformer", "Site ExtractTable"]
+                        with extract_table_tab:
+                            # ExtractTable extraction
+                            extract_table_button = st.button(
+                                "Extraction des tableaux",
+                                key=f"extract_table_btn_{company_id}_{year}",
                             )
-
-                            # Extraction
-                            with table_transformer_tab:
-                                extraction_button = st.button(
-                                    "Extraction des tableaux",
-                                    key=f"extraction_btn_{company_id}_{year}",
+                            text_placeholder = st.empty()
+                            if not st.session_state.get(
+                                f"extract_table_btn_{company_id}_{year}_state"
+                            ):
+                                st.session_state[
+                                    f"extract_table_btn_{company_id}_{year}_state"
+                                ] = extract_table_button
+                            if st.session_state[
+                                f"extract_table_btn_{company_id}_{year}_state"
+                            ]:
+                                extract_table_s3_path = os.path.join(
+                                    EXTRACT_TABLE_EXTRACTIONS_PATH,
+                                    f"{company_id}_{year}",
                                 )
-                                text_placeholder = st.empty()
-                                if not st.session_state.get(
-                                    f"extraction_btn_{company_id}_{year}_state"
-                                ):
-                                    st.session_state[
-                                        f"extraction_btn_{company_id}_{year}_state"
-                                    ] = extraction_button
-                                if st.session_state[
-                                    f"extraction_btn_{company_id}_{year}_state"
-                                ]:
-                                    extraction_s3_path = os.path.join(
-                                        TABLE_TRANSFORMER_EXTRACTIONS_PATH,
-                                        f"{company_id}_{year}",
-                                    )
-                                    if fs.exists(extraction_s3_path):
-                                        text_placeholder.write(
-                                            "L'extraction existe déjà: "
-                                            "accédez-y grâce à l'onglet 'Extractions disponibles'."
-                                        )
-                                    else:
-                                        text_placeholder.write("Extraction en cours...")
-                                        # Table extraction
-                                        table_transformer_output = (
-                                            extract_tables_transformer(document)
-                                        )
-                                        for table_idx, df in enumerate(
-                                            table_transformer_output
-                                        ):
-                                            # Save to persistent storage
-                                            with fs.open(
-                                                os.path.join(
-                                                    extraction_s3_path,
-                                                    f"table_{table_idx}.csv",
-                                                ),
-                                                "wb",
-                                            ) as f:
-                                                df.to_csv(f)
-                                        text_placeholder.write(
-                                            f"Extraction de {len(table_transformer_output)} effectuée: "
-                                            f"accédez-y grâce à l'onglet 'Extractions disponibles'."
-                                        )
-
-                            with extract_table_tab:
-                                # ExtractTable extraction
-                                extract_table_button = st.button(
-                                    "Extraction des tableaux",
-                                    key=f"extract_table_btn_{company_id}_{year}",
+                                extract_table_confidence_s3_path = os.path.join(
+                                    EXTRACT_TABLE_CONFIDENCES_PATH,
+                                    f"{company_id}_{year}",
                                 )
-                                text_placeholder = st.empty()
-                                if not st.session_state.get(
-                                    f"extract_table_btn_{company_id}_{year}_state"
-                                ):
-                                    st.session_state[
-                                        f"extract_table_btn_{company_id}_{year}_state"
-                                    ] = extract_table_button
-                                if st.session_state[
-                                    f"extract_table_btn_{company_id}_{year}_state"
-                                ]:
-                                    extract_table_s3_path = os.path.join(
-                                        EXTRACT_TABLE_EXTRACTIONS_PATH,
-                                        f"{company_id}_{year}",
-                                    )
-                                    extract_table_confidence_s3_path = os.path.join(
-                                        EXTRACT_TABLE_CONFIDENCES_PATH,
-                                        f"{company_id}_{year}",
+                                if fs.exists(extract_table_s3_path):
+                                    text_placeholder.write(
+                                        "L'extraction existe déjà: "
+                                        "accédez-y grâce à l'onglet 'Extractions disponibles'."
                                     )
-                                    if fs.exists(extract_table_s3_path):
-                                        text_placeholder.write(
-                                            "L'extraction existe déjà: "
-                                            "accédez-y grâce à l'onglet 'Extractions disponibles'."
-                                        )
-                                    else:
-                                        text_placeholder.write("Extraction en cours...")
-                                        outputs = extract_tables(document)
-                                        for table_idx, (df, df_conf) in enumerate(
-                                            outputs
-                                        ):
-                                            # Save as excel file
+                                else:
+                                    text_placeholder.write("Extraction en cours...")
+                                    outputs = extract_tables(document)
+                                    for table_idx, (df, df_conf) in enumerate(
+                                        outputs
+                                    ):
+                                        # Save as excel file
+                                        with fs.open(
+                                            os.path.join(
+                                                extract_table_s3_path,
+                                                f"table_{table_idx}.xlsx",
+                                            ),
+                                            "wb",
+                                        ) as f:
+                                            df.to_excel(f)
+                                        # Save confidences
+                                        if df_conf is not None:
                                             with fs.open(
                                                 os.path.join(
-                                                    extract_table_s3_path,
+                                                    extract_table_confidence_s3_path,
                                                     f"table_{table_idx}.xlsx",
                                                 ),
                                                 "wb",
                                             ) as f:
-                                                df.to_excel(f)
-                                            # Save confidences
-                                            if df_conf is not None:
-                                                with fs.open(
-                                                    os.path.join(
-                                                        extract_table_confidence_s3_path,
-                                                        f"table_{table_idx}.xlsx",
-                                                    ),
-                                                    "wb",
-                                                ) as f:
-                                                    df_conf.to_excel(f)
-                                        text_placeholder.write(
-                                            f"Extraction de {len(outputs)} tableaux effectuée: "
-                                            f"accédez-y grâce à l'onglet 'Extractions disponibles'."
-                                        )
-                        except ValueError as e:
-                            # Print error message.
-                            st.write(str(e))
+                                                df_conf.to_excel(f)
+                                    text_placeholder.write(
+                                        f"Extraction de {len(outputs)} tableaux effectuée: "
+                                        f"accédez-y grâce à l'onglet 'Extractions disponibles'."
+                                    )
+                    except ValueError as e:
+                        # Print error message.
+                        st.write(str(e))
diff --git a/app/streamlit_utils.py b/app/streamlit_utils.py
index ee4ef88..80cab68 100644
--- a/app/streamlit_utils.py
+++ b/app/streamlit_utils.py
@@ -36,6 +36,7 @@ def sidebar_content():
     """
     Add side bar content for ExtractTable authentication.
     """
+    # Add ExtractTable token input
     if "auth_token" not in st.session_state:
         st.session_state.auth_token = None
     token = st.sidebar.text_input("ExtractTable token", type="password", key="token")
@@ -44,3 +45,22 @@ def sidebar_content():
             st.session_state.auth_token = token
             remaining_credits = get_extract_table_credits(token)
             st.sidebar.write(f"Crédits restants: {remaining_credits}")
+
+    # Add INPI credentials input (defaults: not authenticated, no credentials)
+    if "inpi_auth" not in st.session_state:
+        st.session_state.inpi_auth = False
+    if "inpi_credentials" not in st.session_state:
+        st.session_state.inpi_credentials = {}
+    inpi_username = st.sidebar.text_input("Nom d'utilisateur INPI", key="inpi_username")
+    inpi_password = st.sidebar.text_input("Mot de passe INPI", type="password", key="inpi_password")
+    if st.sidebar.button("Authentification INPI"):
+        if inpi_username and inpi_password:
+            st.session_state.inpi_credentials = {
+                "username": inpi_username,
+                "password": inpi_password,
+            }
+            st.sidebar.write("Credentials INPI renseignés.")
+            # TODO: implement test to check credentials work
+            # TODO: if test passes, modify session state
+            if True:
+                st.session_state.inpi_auth = True

From f4e039524a990cae98806c5f0415f62f6eecb58b Mon Sep 17 00:00:00 2001
From: tomseimandi <tom.seimandi@gmail.com>
Date: Fri, 12 Apr 2024 07:58:14 -0400
Subject: [PATCH 2/2] Edit README

---
 README.md | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/README.md b/README.md
index a963ca4..e27b79d 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,7 @@
 
 ## Mise en route
 
-Avant de lancer l'application, installer les dépendances avec `pip install -r requirements.txt`, puis renseigner les variables d'environnement:
-
-- `TEST_INPI_USERNAME`: nom d'utilisation du compte INPI;
-- `TEST_INPI_PASSWORD`: mot de passe du compte INPI;
-
-Puis lancer l'application avec `streamlit run main.py --server.port=8501 --server.address=0.0.0.0` par exemple.
+Avant de lancer l'application, installer les dépendances avec `pip install -r requirements.txt`, puis lancer l'application avec `streamlit run main.py --server.port=8501 --server.address=0.0.0.0` par exemple.
 
 ## Briques