From 003a03ac191bb6f7069b92385eb996ebf901b352 Mon Sep 17 00:00:00 2001
From: Lawrence Wu <lawwu@users.noreply.github.com>
Date: Tue, 24 Sep 2024 13:59:49 -0700
Subject: [PATCH] Convert to dynamic data fetching from database

Related to #2

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/lawwu/transcripts/issues/2?shareId=XXXX-XXXX-XXXX-XXXX).
---
 data/database_schema.sql                 | 21 ++++++++
 src/transcripts/generate_html.py         | 27 ++++++++++
 src/transcripts/transcribe_audio.py      | 38 ++++++++++++++
 src/transcripts/transcribe_new_videos.py | 44 +++++++++++++++++
 src/transcripts/transcribe_youtube.py    | 38 ++++++++++++++
 src/transcripts/utils.py                 | 63 ++++++++++++++++++++++++
 6 files changed, 231 insertions(+)
 create mode 100644 data/database_schema.sql

diff --git a/data/database_schema.sql b/data/database_schema.sql
new file mode 100644
index 000000000..36f962c36
--- /dev/null
+++ b/data/database_schema.sql
@@ -0,0 +1,21 @@
+-- SQL script to define the database schema
+
+-- Create table for storing video details
+CREATE TABLE IF NOT EXISTS video_details (
+    video_id TEXT PRIMARY KEY,
+    title TEXT NOT NULL,
+    upload_date TEXT,
+    duration INTEGER,
+    channel_name TEXT
+);
+
+-- Create table for storing transcript data
+CREATE TABLE IF NOT EXISTS transcripts (
+    video_id TEXT PRIMARY KEY,
+    transcript TEXT NOT NULL,
+    FOREIGN KEY (video_id) REFERENCES video_details (video_id)
+);
+
+-- Index upload_date for queries that filter or sort on it. transcripts.video_id needs no
+-- explicit index: its PRIMARY KEY constraint already makes SQLite create a unique index.
+CREATE INDEX IF NOT EXISTS idx_video_details_upload_date ON video_details (upload_date);
diff --git a/src/transcripts/generate_html.py b/src/transcripts/generate_html.py
index 76d4737e7..44a0025a5 100644
--- a/src/transcripts/generate_html.py
+++ b/src/transcripts/generate_html.py
@@ -4,6 +4,7 @@
 import re
 from yt_dlp import YoutubeDL
 from datetime import datetime
+import sqlite3
 
 from transcripts.utils import (
     transcripts_dir,
@@ -587,6 +588,32 @@ def generate_transcript_page(video_id):
     logging.info(f"Transcript page generated at {output_file}")
 
 
+def connect_to_db():
+    """Open the transcripts.db file under data_dir; return None (after logging) on sqlite3.Error."""
+    try:
+        return sqlite3.connect(data_dir / "transcripts.db")
+    except sqlite3.Error as err:
+        logging.error(f"Error connecting to database: {err}")
+        return None
+
+
+def fetch_transcript_from_db(video_id):
+    """Return the stored transcript for video_id, or None if there is no row or the DB fails."""
+    conn = connect_to_db()
+    if conn is None:
+        return None
+
+    try:
+        query = "SELECT transcript FROM transcripts WHERE video_id=?"
+        row = conn.execute(query, (video_id,)).fetchone()
+        return row[0] if row else None
+    except sqlite3.Error as err:
+        logging.error(f"Error fetching transcript from database: {err}")
+        return None
+    finally:
+        conn.close()
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Generate HTML from YouTube transcript"
diff --git a/src/transcripts/transcribe_audio.py b/src/transcripts/transcribe_audio.py
index 87f15f3e8..69a31624a 100644
--- a/src/transcripts/transcribe_audio.py
+++ b/src/transcripts/transcribe_audio.py
@@ -4,12 +4,14 @@
 import re
 import ffmpeg
 import logging
+import sqlite3
 
 from transcripts.utils import (
     transcripts_dir,
     model_dir,
     whispercpp_dir,
     timeit,
+    data_dir,
 )
 
 log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -166,6 +168,33 @@ def clean_up_wav_files(base_filename):
         logging.error(f"Error during file cleanup: {e}")
 
 
+def connect_to_db():
+    """Open the transcripts.db file under data_dir; return None (after logging) on sqlite3.Error."""
+    try:
+        return sqlite3.connect(data_dir / "transcripts.db")
+    except sqlite3.Error as err:
+        logging.error(f"Error connecting to database: {err}")
+        return None
+
+
+def insert_transcript_to_db(video_id, transcript):
+    """Store (or overwrite) the transcript for video_id; DB errors are logged, not raised."""
+    conn = connect_to_db()
+    if not conn:
+        return
+
+    try:
+        conn.execute(  # OR REPLACE: re-transcribing a known video_id updates the row instead of failing
+            "INSERT OR REPLACE INTO transcripts (video_id, transcript) VALUES (?, ?)",
+            (video_id, transcript),
+        )
+        conn.commit()
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting transcript into database: {e}")
+    finally:
+        conn.close()
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Automate downloading and processing audio from YouTube/Vimeo."
@@ -191,6 +220,15 @@ def main():
     logging.info("Run whisper.cpp using Metal")
     run_whisper(args.name, args.model_name)
 
+    # Read the generated transcript
+    transcript_path = transcripts_dir / f"{args.name}.txt"
+    try:
+        with open(transcript_path, "r", encoding="utf-8") as f:
+            transcript = f.read()
+        insert_transcript_to_db(args.name, transcript)
+    except Exception as e:
+        logging.error(f"Error reading transcript file: {e}")
+
     # Clean up WAV files after processing
     # clean_up_wav_files(wav_file_name)
 
diff --git a/src/transcripts/transcribe_new_videos.py b/src/transcripts/transcribe_new_videos.py
index 44098cf7c..7b97d63ca 100644
--- a/src/transcripts/transcribe_new_videos.py
+++ b/src/transcripts/transcribe_new_videos.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+import sqlite3
 from transcripts.utils import data_dir, configs_dir
 from pathlib import Path
 
@@ -70,6 +71,33 @@ def update_video_files(new_ids, existing_ids, done_file_path, in_file_path):
             f.write("")
 
 
+def connect_to_db():
+    """Open the transcripts.db file under data_dir; return None (after logging) on sqlite3.Error."""
+    try:
+        return sqlite3.connect(data_dir / "transcripts.db")
+    except sqlite3.Error as err:
+        logging.error(f"Error connecting to database: {err}")
+        return None
+
+
+def insert_video_details_to_db(video_id, title, upload_date, duration, channel_name):
+    """Store (or overwrite) the metadata row for video_id; DB errors are logged, not raised."""
+    conn = connect_to_db()
+    if not conn:
+        return
+
+    try:
+        conn.execute(  # OR REPLACE: a re-run for a known video_id refreshes the row instead of failing
+            "INSERT OR REPLACE INTO video_details (video_id, title, upload_date, duration, channel_name) VALUES (?, ?, ?, ?, ?)",
+            (video_id, title, upload_date, duration, channel_name),
+        )
+        conn.commit()
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting video details into database: {e}")
+    finally:
+        conn.close()
+
+
 # Download other videos (manual list)
 # Step 2: Load existing video ids from the done file
 other_done_ids = load_existing_ids("other_ids_done.txt")
@@ -159,6 +187,22 @@ def update_video_files(new_ids, existing_ids, done_file_path, in_file_path):
         # Transcribe new videos
         subprocess.run(["./bash_transcribe.sh", f"./data/{channel['in_file']}"])
 
+        # Insert new video details into the database
+        for vid in new_ids:
+            # Argument list, no shell: the id is never parsed by a shell, and "--" keeps
+            # yt-dlp from treating an id that starts with "-" as an option.
+            result = subprocess.run(["yt-dlp", "-j", "--", vid], stdout=subprocess.PIPE, text=True)
+            try:
+                video_details = json.loads(result.stdout)
+            except json.JSONDecodeError as e:
+                logging.error(f"Could not fetch video details for {vid}: {e}")
+                continue  # one failed lookup must not abort the remaining videos
+            title = video_details.get("title", "")
+            upload_date = video_details.get("upload_date", "")
+            duration = video_details.get("duration", 0)
+            # Insert video details into the database
+            insert_video_details_to_db(vid, title, upload_date, duration, channel["name"])
+
 # Generate html and output to docs/
 subprocess.run(["python", "src/transcripts/generate_html.py"])
 
diff --git a/src/transcripts/transcribe_youtube.py b/src/transcripts/transcribe_youtube.py
index 278d780fb..c06887bb6 100644
--- a/src/transcripts/transcribe_youtube.py
+++ b/src/transcripts/transcribe_youtube.py
@@ -4,12 +4,14 @@
 import re
 import ffmpeg
 import logging
+import sqlite3
 
 from transcripts.utils import (
     transcripts_dir,
     model_dir,
     whispercpp_dir,
     timeit,
+    data_dir,
 )
 
 log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -164,6 +166,33 @@ def clean_up_wav_files(base_filename):
         logging.error(f"Error during file cleanup: {e}")
 
 
+def connect_to_db():
+    """Open the transcripts.db file under data_dir; return None (after logging) on sqlite3.Error."""
+    try:
+        return sqlite3.connect(data_dir / "transcripts.db")
+    except sqlite3.Error as err:
+        logging.error(f"Error connecting to database: {err}")
+        return None
+
+
+def insert_transcript_to_db(video_id, transcript):
+    """Store (or overwrite) the transcript for video_id; DB errors are logged, not raised."""
+    conn = connect_to_db()
+    if not conn:
+        return
+
+    try:
+        conn.execute(  # OR REPLACE: re-transcribing a known video_id updates the row instead of failing
+            "INSERT OR REPLACE INTO transcripts (video_id, transcript) VALUES (?, ?)",
+            (video_id, transcript),
+        )
+        conn.commit()
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting transcript into database: {e}")
+    finally:
+        conn.close()
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Automate downloading and processing audio from YouTube/Vimeo."
@@ -191,6 +220,15 @@ def main():
     logging.info("Run whisper.cpp using Metal")
     run_whisper(wav_file_name, args.model_name)
 
+    # Read the generated transcript
+    transcript_path = transcripts_dir / f"{wav_file_name}.txt"
+    try:
+        with open(transcript_path, "r", encoding="utf-8") as f:
+            transcript = f.read()
+        insert_transcript_to_db(wav_file_name, transcript)
+    except Exception as e:
+        logging.error(f"Error reading transcript file: {e}")
+
     # Clean up WAV files after processing
     clean_up_wav_files(wav_file_name)
 
diff --git a/src/transcripts/utils.py b/src/transcripts/utils.py
index 18608a2f3..6d64521fa 100644
--- a/src/transcripts/utils.py
+++ b/src/transcripts/utils.py
@@ -5,6 +5,7 @@
 import json
 import pandas as pd
 from datetime import datetime
+import sqlite3
 
 project_dir = Path(__file__).resolve().parents[2]
 data_dir = project_dir / "data"
@@ -79,3 +80,65 @@ def extract_yt_id(url):
     else:
         # Return None or an appropriate message if no video ID is found
         return None
+
+
+def connect_to_db():
+    """Open the transcripts.db file under data_dir; return None (after logging) on sqlite3.Error."""
+    try:
+        return sqlite3.connect(data_dir / "transcripts.db")
+    except sqlite3.Error as err:
+        logging.error(f"Error connecting to database: {err}")
+        return None
+
+
+def fetch_transcript_from_db(video_id):
+    """Return the stored transcript for video_id, or None if there is no row or the DB fails."""
+    conn = connect_to_db()
+    if conn is None:
+        return None
+
+    try:
+        query = "SELECT transcript FROM transcripts WHERE video_id=?"
+        row = conn.execute(query, (video_id,)).fetchone()
+        return row[0] if row else None
+    except sqlite3.Error as err:
+        logging.error(f"Error fetching transcript from database: {err}")
+        return None
+    finally:
+        conn.close()
+
+
+def insert_transcript_to_db(video_id, transcript):
+    """Store (or overwrite) the transcript for video_id; DB errors are logged, not raised."""
+    conn = connect_to_db()
+    if not conn:
+        return
+
+    try:
+        conn.execute(  # OR REPLACE: re-transcribing a known video_id updates the row instead of failing
+            "INSERT OR REPLACE INTO transcripts (video_id, transcript) VALUES (?, ?)",
+            (video_id, transcript),
+        )
+        conn.commit()
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting transcript into database: {e}")
+    finally:
+        conn.close()
+
+
+def insert_video_details_to_db(video_id, title, upload_date, duration, channel_name):
+    """Store (or overwrite) the metadata row for video_id; DB errors are logged, not raised."""
+    conn = connect_to_db()
+    if not conn:
+        return
+
+    try:
+        conn.execute(  # OR REPLACE: a re-run for a known video_id refreshes the row instead of failing
+            "INSERT OR REPLACE INTO video_details (video_id, title, upload_date, duration, channel_name) VALUES (?, ?, ?, ?, ?)",
+            (video_id, title, upload_date, duration, channel_name),
+        )
+        conn.commit()
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting video details into database: {e}")
+    finally:
+        conn.close()