trips/app.py at main · umilISLab/trips · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import glob
import html
import json
import os
from datetime import datetime

import pandas as pd
import streamlit as st

# --- Configuration ---
# Directory that get_document_options() scans for processed "*.json" files.
PROCESSED_DIR = "./output/processed"
# CSV read by load_convocations_data() using ";" as the field separator.
CONVOCATIONS_FILE_PATH = "./data/convocations.csv"
# Shared height of the PDF embed and of the scrollable transcription container.
PDF_VIEW_HEIGHT = 800  # pixels


@st.cache_data
def load_convocations_data():
    """Read the convocations CSV (``;``-separated) and return it as a DataFrame.

    The function is wrapped in ``st.cache_data``; the path comes from
    ``CONVOCATIONS_FILE_PATH``.
    """
    convocations = pd.read_csv(CONVOCATIONS_FILE_PATH, sep=";")
    return convocations


def get_document_options(processed_dir=None):
    """
    Scans a directory of processed JSON files and builds the selector options.

    File names are expected to look like ``<doc_id>-<pdf_id>-<YYYYMMDD>.json``.
    Files that do not follow that pattern (not exactly three ``-``-separated
    parts, or a date part that is not a valid calendar day) are skipped, so a
    single stray file in the directory cannot break the whole selector.

    Args:
        processed_dir: Directory to scan. Defaults to ``PROCESSED_DIR``.

    Returns:
        A list of dicts sorted by date (oldest first, ties broken by file
        name). Each dict has the keys ``display_name``, ``filename``,
        ``filepath``, ``doc_id``, ``pdf_id`` and ``date`` (a ``datetime``).
    """
    if processed_dir is None:
        processed_dir = PROCESSED_DIR

    options = []
    for filepath in glob.glob(os.path.join(processed_dir, "*.json")):
        filename = os.path.basename(filepath)
        stem = os.path.splitext(filename)[0]

        parts = stem.split("-")
        if len(parts) != 3:
            # Not a "<doc_id>-<pdf_id>-<date>" name: ignore it.
            continue
        doc_id, pdf_id_str, date_str = parts

        try:
            year = int(date_str[:4])
            month = int(date_str[4:6])
            day = int(date_str[6:8])
            date_obj = datetime(year, month, day)
        except ValueError:
            # Non-numeric or impossible date (e.g. month 13): ignore the file.
            continue

        options.append(
            {
                "display_name": f"{day:02d}/{month:02d}/{year} - {doc_id}.{pdf_id_str}",
                "filename": filename,
                "filepath": filepath,
                "doc_id": doc_id,
                "pdf_id": pdf_id_str,
                "date": date_obj,
            }
        )

    # Sort by date (oldest to newest); the file name makes the order of
    # same-day documents independent of the order glob happens to return.
    options.sort(key=lambda opt: (opt["date"], opt["filename"]))
    return options


def get_pdf_url(df_convocations, document_id_str, pdf_id_str):
    """
    Retrieves the PDF URL for one document from the convocations dataframe.

    The row is located by comparing the ``seduta_uri`` column with
    ``http://dati.camera.it/ocd/seduta.rdf/<document_id_str>``. Its
    ``seduta_url`` cell is read as a ``;``-separated list of links, and the
    link at position ``pdf_id_str`` is returned. If several rows match, the
    first one is used.

    Args:
        df_convocations: DataFrame with ``seduta_uri`` and ``seduta_url`` columns.
        document_id_str: Document identifier appended to the base URI.
        pdf_id_str: Index of the wanted link within the cell, as a string.

    Returns:
        The selected link with surrounding whitespace stripped.

    Raises:
        ValueError: If ``pdf_id_str`` cannot be converted to ``int``.
        IndexError: If no row matches the document, the matching row has no
            link text, or the index is outside the list of links.
    """
    pdf_idx = int(pdf_id_str)
    seduta_uri = f"http://dati.camera.it/ocd/seduta.rdf/{document_id_str}"

    filtered_df = df_convocations[df_convocations["seduta_uri"] == seduta_uri]
    if filtered_df.empty:
        raise IndexError(f"No convocation found for {seduta_uri}")

    pdf_links_str = filtered_df.iloc[0]["seduta_url"]
    if not isinstance(pdf_links_str, str):
        # An empty CSV cell is not a string (pandas loads it as NaN).
        raise IndexError(f"Convocation {seduta_uri} has no PDF links")

    pdf_links_list = pdf_links_str.split(";")
    try:
        link = pdf_links_list[pdf_idx]
    except IndexError:
        raise IndexError(
            f"PDF index {pdf_idx} is out of range for {seduta_uri} "
            f"({len(pdf_links_list)} link(s) available)"
        ) from None
    return link.strip()


def display_pdf_from_url_iframe(pdf_url):
    """
    Embeds the PDF located at ``pdf_url`` in the page.

    Nothing is downloaded by this function: it emits an HTML ``<embed>``
    element whose ``src`` is the URL, rendered through ``st.markdown`` with
    ``unsafe_allow_html=True``. The element is ``PDF_VIEW_HEIGHT`` pixels tall
    and takes the full width of its container.

    Args:
        pdf_url: URL of the PDF document to embed.
    """
    # The URL is placed inside a double-quoted HTML attribute that is rendered
    # as raw HTML, so escape it to keep quotes or angle brackets in the value
    # from breaking out of the attribute.
    safe_url = html.escape(pdf_url, quote=True)
    pdf_display = f'<embed src="{safe_url}" width="100%" height="{PDF_VIEW_HEIGHT}" type="application/pdf">'
    st.markdown(pdf_display, unsafe_allow_html=True)


def format_transcription_entry(entry):
    """Formats a single transcription entry into markdown.

    The entry is a dict read with these keys (all optional):

    * ``content``: the text to show (default ``""``).
    * ``speaker``: speaker name; ``"none"``, ``"unknown"``, ``None`` and the
      empty string all mean "no speaker" and produce no prefix.
    * ``speaker_uri``: list of URIs for the speaker. A bare string is treated
      as a one-element list.
    * ``type``: ``"note"``/``"footnote"`` render the content in italics,
      ``"page-header"``/``"section-header"`` render a level-4 heading, and
      anything else renders as plain text.

    Speaker prefix rules:

    * exactly one URI (other than ``"none"``): bold link to that URI;
    * several URIs: bold name without a link;
    * no URI, or a first URI of ``"none"``: italic name.

    Returns:
        The markdown string for the entry.
    """
    content = entry.get("content", "")
    speaker = entry.get("speaker", "none")
    text_type = entry.get("type", "text")
    speaker_uri = entry.get("speaker_uri", "none")

    # Without this, the string default "none" would be indexed per character
    # ("n" != "none") and a speaker with no URI would be rendered in bold.
    if isinstance(speaker_uri, str):
        speaker_uri = [speaker_uri]

    prefix = ""
    if speaker and speaker not in ("none", "unknown"):
        name = speaker.strip()
        if speaker_uri and speaker_uri[0] != "none":
            if len(speaker_uri) == 1:
                prefix = f"**[{name}]({speaker_uri[0]})**"
            else:
                # Several candidate URIs: show the name without a link.
                prefix = f"**{name}**"
        else:
            prefix = f"*{name}*"

    if text_type in ("note", "footnote"):
        return f"{prefix} _{content}_"
    if text_type in ("page-header", "section-header"):
        # Keep the speaker and the heading text apart when both are present.
        separator = " " if prefix else ""
        return f"#### {prefix}{separator}{content}"
    # 'text' and any other type
    return f"{prefix} {content}"


# --- Streamlit App Layout ---
# Page flow: pick a processed document from the selector, then show its
# transcription (left) next to the source PDF (right).
st.set_page_config(layout="wide")
st.title("🇮🇹 Italian Parliamentary Speeches")

df_convocations = load_convocations_data()
doc_options_data = get_document_options()

# Create a mapping from display name to the full option dictionary
options_dict = {opt["display_name"]: opt for opt in doc_options_data}

# Selector for documents (nothing is selected until the user picks one)
selected_display_name = st.selectbox(
    "Select Document:",
    options=[opt["display_name"] for opt in doc_options_data],
    index=None,
    placeholder="Select a document...",
    label_visibility="collapsed",
)

if selected_display_name is not None:
    selected_option = options_dict[selected_display_name]

    st.markdown("---")  # Separator

    # --- Layout with columns ---
    col_transcription, col_pdf = st.columns(2)

    # --- Transcription Column (Left) ---
    with col_transcription:
        st.subheader("📜 Transcription")
        try:
            with open(selected_option["filepath"], "r", encoding="utf-8") as f:
                transcription_data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            st.error(f"Could not load the transcription: {exc}")
        else:
            # Use st.container with a fixed height for scrollability
            with st.container(height=PDF_VIEW_HEIGHT):  # Match PDF view height
                if isinstance(transcription_data, list):
                    for entry in transcription_data:
                        # NOTE(review): entries are rendered with raw HTML
                        # enabled, so the JSON content must be trusted.
                        st.markdown(
                            format_transcription_entry(entry),
                            unsafe_allow_html=True,
                        )
                else:
                    st.warning(
                        "Unexpected transcription format: expected a list of entries."
                    )

    # --- PDF Column (Right) ---
    with col_pdf:
        st.subheader("📄 PDF Document")
        try:
            pdf_url = get_pdf_url(
                df_convocations, selected_option["doc_id"], selected_option["pdf_id"]
            )
        except (LookupError, ValueError, AttributeError) as exc:
            # No matching convocation row, a missing column, a non-numeric or
            # out-of-range PDF index, or an empty link cell.
            st.error(f"Could not find the PDF for this document: {exc}")
        else:
            display_pdf_from_url_iframe(pdf_url)