dorushugo · dorushugo · Jan 30, 2025 · Feb 4, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/scrapping/main.py b/scrapping/main.py
@@ -0,0 +1,251 @@
+import requests
+from bs4 import BeautifulSoup
+import csv # Importation du module csv
+import multiprocessing # Importation du module multiprocessing
+import re # Importation du module re
+from supabase import create_client, Client
+import os
+import json
+from openai import OpenAI
+import numpy as np
+
+# Supabase credentials
+SUPABASE_URL = "https://ggjjbljpisptelylnczn.supabase.co"
+SUPABASE_SERVICE_ROLE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImdnampibGpwaXNwdGVseWxuY3puIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTczODIzNDY0OSwiZXhwIjoyMDUzODEwNjQ5fQ.VAKzwStH8XL1bZTOJXUKWszDjE0QpwRB0CYkQU39Xrk"
+
+OPEN_AI_SECRET_KEY="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"
+
+# Initialiser le client OpenAI
+client = OpenAI(api_key=OPEN_AI_SECRET_KEY)
+
+def get_embedding(text: str) -> list[float]:
+    """Génère un embedding pour le texte donné en utilisant OpenAI"""
+    try:
+        # Limiter la taille du texte à 8191 tokens (limite du modèle)
+        truncated_text = text[:8000]
+        response = client.embeddings.create(
+            input=truncated_text,
+            model="text-embedding-3-small"
+        )
+        return response.data[0].embedding
+    except Exception as e:
+        print(f"Erreur lors de la génération de l'embedding : {str(e)}")
+        return []
+
+def scrape_war_summary(war_url, war_id):
+    """
+    Scrape Wikipedia page for a war and extract summary, name, date, and full content.
+
+    Args:
+        war_url (str): URL of the Wikipedia page for the war.
+        war_id (int): Unique identifier for the war.
+
+    Returns:
+        dict: Dictionary containing war name, date, summary text, full content, and URL.
+              Returns None if scraping fails.
+    """
+    print(f"Processus {multiprocessing.current_process().name} - Début du scraping de la page de guerre: {war_url}") # LOG: Ajout du nom du processus
+    try:
+        response = requests.get(war_url)
+        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # 📌 Titre de la guerre
+        war_name = soup.find("h1").text.strip()
+
+        # 📌 Extraction de l'infobox (tableau avec les infos générales)
+        caption = soup.find("caption", string="Informations générales")  # Cherche le caption spécifique
+        general_info = {}
+        info_table = None
+        if caption:
+            info_table = caption.find_parent("table")
+        if info_table:
+            rows = info_table.find_all("tr")  # Trouver toutes les lignes du tableau
+
+            for row in rows:
+                header = row.find("th")  # Colonne de gauche (nom de l'info)
+                value = row.find("td")   # Colonne de droite (valeur associée)
+
+                if header and value:
+                    key = header.get_text(strip=True)
+
+                    # 🔎 Gestion des balises <time> et <a>
+                    if value.find("time"):
+                        val = " - ".join(time.get_text(strip=True) for time in value.find_all("time"))
+                    elif value.find("a"):
+                        val = ", ".join(a.get_text(strip=True) for a in value.find_all("a"))
+                    else:
+                        val = value.get_text(strip=True)
+
+                    general_info[key] = val
+
+        # Transformation de general_info en une seule chaîne de texte
+        general_info_text = "; ".join(f"{key}: {value}" for key, value in general_info.items())
+
+        # 📌 Extraction des paragraphes
+        paragraphs = [p.text.strip() for p in soup.find_all("p") if p.text.strip()]
+
+        # 📖 Résumé = 3 premiers paragraphes
+        summary = " ".join(paragraphs[:3])
+
+        # 📖 Conclusion = 2-3 derniers paragraphes (en évitant les références)
+        last_paragraphs = paragraphs[-5:]
+        conclusion = []
+        for para in reversed(last_paragraphs):
+            if len(conclusion) < 3:
+                conclusion.insert(0, para)
+
+        conclusion_text = " ".join(conclusion)
+
+        # 📚 Extraction des sections principales (sans "[modifier | modifier le code]")
+        sections = [h2.text.strip().replace("[modifier | modifier le code]", "").strip() for h2 in soup.find_all("div", class_="mw-heading mw-heading2")]
+
+        # Extraction du contenu complet
+        content_div = soup.select_one('div#mw-content-text > div.mw-parser-output')
+        full_content = content_div.get_text(" ", strip=True) if content_div else "Contenu non trouvé"
+
+        # Génération de l'embedding sur le titre au lieu du résumé
+        title_embedding = get_embedding(war_name) if war_name else []
+
+        # Construction des données
+        war_data = {
+            "id": war_id,
+            "name": war_name,
+            "general_info": general_info_text,
+            "sections": sections,
+            "summary": summary,
+            "title_embedding": title_embedding,
+            "conclusion": conclusion_text,
+            "full_content": full_content,
+            "url": war_url
+        }
+
+        # Affichage des résultats
+        print("\n" + "="*50)
+        print(f"📌 Titre : {war_data['name']}")
+        print(f"🌍 Infos générales : {general_info_text[:100]}...")
+        print(f"📚 Sections : {', '.join(sections[:3])}...")
+        print(f"📖 Résumé ({len(summary)} caractères) : {summary[:200]}...")
+        print("="*50 + "\n")
+
+        return war_data
+
+    except requests.exceptions.RequestException as e:
+        print(f"Processus {multiprocessing.current_process().name} - Erreur de requête HTTP lors de la récupération de l'URL {war_url}: {e}") # LOG: Ajout du nom du processus
+        return None
+    except Exception as e:
+        print(f"Processus {multiprocessing.current_process().name} - Erreur lors du traitement de l'URL {war_url}: {e}") # LOG: Ajout du nom du processus
+        return None
+
+
+def get_list_of_wars_urls(list_page_url):
+    """
+    Scrape a Wikipedia list page to get URLs of war pages.
+
+    Args:
+        list_page_url (str): URL of the Wikipedia page listing wars.
+
+    Returns:
+        list: List of Wikipedia URLs for wars.
+    """
+    print(f"Début de la récupération de la liste des URLs de guerres depuis: {list_page_url}") # LOG: Début récupération liste URLs
+    try:
+        response = requests.get(list_page_url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        war_urls = []
+        # Adjust selector based on the structure of the list page
+        list_items = soup.select('div#mw-content-text div.mw-parser-output ul li a[href^="/wiki/"]') # Common list format
+        for item in list_items:
+            war_page_url = "https://fr.wikipedia.org" + item['href']
+            if "/wiki/Liste_de" not in war_page_url and "/wiki/Guerre_" in war_page_url: # Filter out list pages and keep only war pages (heuristic)
+                 war_urls.append(war_page_url)
+
+        print("URLs de guerres trouvées :", war_urls) # LOG: URLs de guerres trouvées
+        print(f"Liste des URLs de guerres récupérée avec succès depuis: {list_page_url}") # LOG: Fin récupération liste URLs
+        return war_urls
+
+    except requests.exceptions.RequestException as e:
+        print(f"Erreur de requête HTTP lors de la récupération de la page de liste {list_page_url}: {e}")
+        return []
+    except Exception as e:
+        print(f"Erreur lors du traitement de la page de liste {list_page_url}: {e}")
+        return []
+
+
+def test_supabase_connection():
+    try:
+        supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
+        result = supabase.table('wars').select("count", count="exact").execute()
+        print("✅ Connexion à Supabase réussie")
+        print(f"Nombre d'entrées dans la table : {result.count}")
+    except Exception as e:
+        print(f"🚨 Erreur de connexion : {str(e)}")
+        exit()
+
+
+if __name__ == "__main__":
+    test_supabase_connection()
+    list_of_wars_page_url = "https://fr.wikipedia.org/wiki/Liste_des_guerres"
+    print(f"URL de la page de liste des guerres utilisée: {list_of_wars_page_url}")
+    war_urls = get_list_of_wars_urls(list_of_wars_page_url)
+
+    if war_urls:
+        all_wars_data = []
+        print(f"Début du scraping des pages de guerres en parallèle avec multiprocessing...")
+
+        with multiprocessing.Pool() as pool:
+            results = pool.starmap(scrape_war_summary, [(url, i) for i, url in enumerate(war_urls)])
+
+        all_wars_data = [war_data for war_data in results if war_data]
+
+        print(f"Fin du scraping. Début de l'insertion dans Supabase...")
+
+        # Initialisation du client Supabase avec vérification
+        try:
+            supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
+            print("Connexion à Supabase réussie")
+        except Exception as e:
+            print(f"Échec de la connexion à Supabase : {str(e)}")
+            exit()
+
+        # Vérification de l'existence de la table
+        try:
+            test = supabase.table("wars").select("count", count="exact").execute()
+            print("La table 'wars' existe bien")
+        except Exception as e:
+            print(f"La table 'wars' n'existe pas ou erreur d'accès : {str(e)}")
+            exit()
+
+        # Insertion des données avec meilleure gestion d'erreur
+        for war in all_wars_data:
+            try:
+                # Convertir les listes en JSON
+                war['sections'] = json.dumps(war.get('sections', []))
+
+                # Nettoyer les textes
+                for key in ['summary', 'conclusion', 'full_content']:
+                    if key in war:
+                        war[key] = war[key].encode('utf-8', 'ignore').decode('utf-8')
+
+                # Debug: Afficher les données avant insertion
+                print(f"Données à insérer : {json.dumps(war, indent=2, ensure_ascii=False)}")
+
+                # Insertion
+                if 'title_embedding' in war and war['title_embedding']:
+                    war['title_embedding'] = np.array(war['title_embedding']).tolist()
+                data, count = supabase.table("wars").insert(war).execute()
+                print(f"✅ Insertion réussie : {war['name']}")
+
+            except Exception as e:
+                print(f"🚨 Erreur d'insertion : {str(e)}")
+                if hasattr(e, 'args') and len(e.args) > 0:
+                    print(f"Détails de l'erreur : {e.args[0]}")
+                continue
+
+        print("Insertion terminée avec succès !")
+
+    else:
+        print("Aucune URL de guerre trouvée.")
+    print("Script terminé.")
diff --git a/webapp/.gitignore b/webapp/.gitignore
@@ -0,0 +1,41 @@
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.*
+.yarn/*
+!.yarn/patches
+!.yarn/plugins
+!.yarn/releases
+!.yarn/versions
+
+# testing
+/coverage
+
+# next.js
+/.next/
+/out/
+
+# production
+/build
+
+# misc
+.DS_Store
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# env files (can opt-in for committing if needed)
+.env*
+
+# vercel
+.vercel
+
+# typescript
+*.tsbuildinfo
+next-env.d.ts
diff --git a/webapp/README.md b/webapp/README.md
@@ -0,0 +1,36 @@
+This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app).
+
+## Getting Started
+
+First, run the development server:
+
+```bash
+npm run dev
+# or
+yarn dev
+# or
+pnpm dev
+# or
+bun dev
+```
+
+Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
+
+You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.
+
+This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel.
+
+## Learn More
+
+To learn more about Next.js, take a look at the following resources:
+
+- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
+- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
+
+You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
+
+## Deploy on Vercel
+
+The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
+
+Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.