From 003a03ac191bb6f7069b92385eb996ebf901b352 Mon Sep 17 00:00:00 2001
From: Lawrence Wu
Date: Tue, 24 Sep 2024 13:59:49 -0700
Subject: [PATCH] Convert to dynamic data fetching from database

Related to #2

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/lawwu/transcripts/issues/2?shareId=XXXX-XXXX-XXXX-XXXX).
---
Review notes (not part of the commit message):
- The database helpers are defined once in src/transcripts/utils.py and
  imported by the scripts instead of being copied into every module.
- connect_to_db() applies data/database_schema.sql, so the tables exist the
  first time the database is used.
- Inserts use INSERT OR REPLACE, so re-processing a video no longer fails on
  the primary key.
- yt-dlp is invoked with an argument list instead of a shell string, and a
  failed metadata lookup skips that video instead of aborting the run.
- utils.py imports logging explicitly because the new helpers call
  logging.error; this is harmless if the module already imports it.
- Blank context lines and leading whitespace were reconstructed from the hunk
  line counts. The context indentation in the last hunk of
  transcribe_new_videos.py is assumed to be four spaces. Run
  `git apply --check` before applying. Blob hashes were dropped because the
  post-images changed.

 data/database_schema.sql                 | 21 ++++++
 src/transcripts/generate_html.py         |  1 +
 src/transcripts/transcribe_audio.py      | 11 +++
 src/transcripts/transcribe_new_videos.py | 25 +++++-
 src/transcripts/transcribe_youtube.py    | 11 +++
 src/transcripts/utils.py                 | 83 ++++++++++++++++++++++
 6 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 data/database_schema.sql

diff --git a/data/database_schema.sql b/data/database_schema.sql
new file mode 100644
--- /dev/null
+++ b/data/database_schema.sql
@@ -0,0 +1,21 @@
+-- SQL script to define the database schema
+
+-- Create table for storing video details
+CREATE TABLE IF NOT EXISTS video_details (
+    video_id TEXT PRIMARY KEY,
+    title TEXT NOT NULL,
+    upload_date TEXT,
+    duration INTEGER,
+    channel_name TEXT
+);
+
+-- Create table for storing transcript data
+CREATE TABLE IF NOT EXISTS transcripts (
+    video_id TEXT PRIMARY KEY,
+    transcript TEXT NOT NULL,
+    FOREIGN KEY (video_id) REFERENCES video_details (video_id)
+);
+
+-- Add necessary indexes for efficient querying
+-- (transcripts.video_id is the PRIMARY KEY, so SQLite already indexes it)
+CREATE INDEX IF NOT EXISTS idx_video_details_upload_date ON video_details (upload_date);
diff --git a/src/transcripts/generate_html.py b/src/transcripts/generate_html.py
--- a/src/transcripts/generate_html.py
+++ b/src/transcripts/generate_html.py
@@ -4,6 +4,7 @@
 import re
 from yt_dlp import YoutubeDL
 from datetime import datetime
 
 from transcripts.utils import (
+    fetch_transcript_from_db,
     transcripts_dir,
diff --git a/src/transcripts/transcribe_audio.py b/src/transcripts/transcribe_audio.py
--- a/src/transcripts/transcribe_audio.py
+++ b/src/transcripts/transcribe_audio.py
@@ -4,12 +4,13 @@
 import re
 import ffmpeg
 import logging
 
 from transcripts.utils import (
     transcripts_dir,
     model_dir,
     whispercpp_dir,
     timeit,
+    insert_transcript_to_db,
 )
 
 log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -191,6 +192,16 @@ def main():
     logging.info("Run whisper.cpp using Metal")
     run_whisper(args.name, args.model_name)
 
+    # Read the generated transcript and store it in the database
+    transcript_path = transcripts_dir / f"{args.name}.txt"
+    try:
+        with open(transcript_path, "r", encoding="utf-8") as f:
+            transcript = f.read()
+    except (OSError, UnicodeDecodeError) as e:
+        logging.error(f"Error reading transcript file: {e}")
+    else:
+        insert_transcript_to_db(args.name, transcript)
+
     # Clean up WAV files after processing
     # clean_up_wav_files(wav_file_name)
 
diff --git a/src/transcripts/transcribe_new_videos.py b/src/transcripts/transcribe_new_videos.py
--- a/src/transcripts/transcribe_new_videos.py
+++ b/src/transcripts/transcribe_new_videos.py
@@ -2,6 +2,6 @@
 import json
 import logging
 import os
 
-from transcripts.utils import data_dir, configs_dir
+from transcripts.utils import data_dir, configs_dir, insert_video_details_to_db
 from pathlib import Path
@@ -159,6 +159,29 @@ def update_video_files(new_ids, existing_ids, done_file_path, in_file_path):
     # Transcribe new videos
     subprocess.run(["./bash_transcribe.sh", f"./data/{channel['in_file']}"])
 
+    # Insert new video details into the database
+    for vid in new_ids:
+        # Pass an argument list (no shell) so the id is never shell-parsed
+        result = subprocess.run(
+            ["yt-dlp", "-j", "--", vid],
+            stdout=subprocess.PIPE,
+            text=True,
+        )
+        try:
+            video_details = json.loads(result.stdout)
+        except json.JSONDecodeError as e:
+            # A failed lookup must not abort the remaining videos
+            logging.error(f"Error fetching details for video {vid}: {e}")
+            continue
+
+        insert_video_details_to_db(
+            vid,
+            video_details.get("title") or "",
+            video_details.get("upload_date", ""),
+            video_details.get("duration", 0),
+            channel["name"],
+        )
+
     # Generate html and output to docs/
     subprocess.run(["python", "src/transcripts/generate_html.py"])
 
diff --git a/src/transcripts/transcribe_youtube.py b/src/transcripts/transcribe_youtube.py
--- a/src/transcripts/transcribe_youtube.py
+++ b/src/transcripts/transcribe_youtube.py
@@ -4,12 +4,13 @@
 import re
 import ffmpeg
 import logging
 
 from transcripts.utils import (
     transcripts_dir,
     model_dir,
     whispercpp_dir,
     timeit,
+    insert_transcript_to_db,
 )
 
 log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -191,6 +192,16 @@ def main():
     logging.info("Run whisper.cpp using Metal")
     run_whisper(wav_file_name, args.model_name)
 
+    # Read the generated transcript and store it in the database
+    transcript_path = transcripts_dir / f"{wav_file_name}.txt"
+    try:
+        with open(transcript_path, "r", encoding="utf-8") as f:
+            transcript = f.read()
+    except (OSError, UnicodeDecodeError) as e:
+        logging.error(f"Error reading transcript file: {e}")
+    else:
+        insert_transcript_to_db(wav_file_name, transcript)
+
     # Clean up WAV files after processing
     clean_up_wav_files(wav_file_name)
 
diff --git a/src/transcripts/utils.py b/src/transcripts/utils.py
--- a/src/transcripts/utils.py
+++ b/src/transcripts/utils.py
@@ -5,6 +5,8 @@
 import json
 import pandas as pd
 from datetime import datetime
+import logging
+import sqlite3
 
 project_dir = Path(__file__).resolve().parents[2]
 data_dir = project_dir / "data"
@@ -79,3 +81,84 @@ def extract_yt_id(url):
     else:
         # Return None or an appropriate message if no video ID is found
         return None
+
+
+db_path = data_dir / "transcripts.db"
+schema_path = data_dir / "database_schema.sql"
+
+
+def connect_to_db():
+    """Open the transcripts database, creating its tables if needed.
+
+    Returns:
+        An open sqlite3 connection, or None (after logging the error) if the
+        database cannot be opened or its schema cannot be applied.
+    """
+    try:
+        conn = sqlite3.connect(db_path)
+    except sqlite3.Error as e:
+        logging.error(f"Error connecting to database: {e}")
+        return None
+
+    try:
+        # Every statement in the schema is IF NOT EXISTS, so re-running it
+        # on an initialised database is a no-op.
+        conn.executescript(schema_path.read_text(encoding="utf-8"))
+    except (OSError, sqlite3.Error) as e:
+        logging.error(f"Error initialising database schema: {e}")
+        conn.close()
+        return None
+    return conn
+
+
+def fetch_transcript_from_db(video_id):
+    """Return the stored transcript for video_id, or None if unavailable."""
+    conn = connect_to_db()
+    if conn is None:
+        return None
+
+    try:
+        row = conn.execute(
+            "SELECT transcript FROM transcripts WHERE video_id=?", (video_id,)
+        ).fetchone()
+        return row[0] if row else None
+    except sqlite3.Error as e:
+        logging.error(f"Error fetching transcript from database: {e}")
+        return None
+    finally:
+        conn.close()
+
+
+def _upsert(sql, params, description):
+    """Run one write statement on a fresh connection and commit it."""
+    conn = connect_to_db()
+    if conn is None:
+        return
+
+    try:
+        conn.execute(sql, params)
+        conn.commit()
+    except sqlite3.Error as e:
+        logging.error(f"Error inserting {description} into database: {e}")
+    finally:
+        conn.close()
+
+
+def insert_transcript_to_db(video_id, transcript):
+    """Store transcript for video_id, replacing any existing row."""
+    _upsert(
+        "INSERT OR REPLACE INTO transcripts (video_id, transcript) VALUES (?, ?)",
+        (video_id, transcript),
+        "transcript",
+    )
+
+
+def insert_video_details_to_db(video_id, title, upload_date, duration, channel_name):
+    """Store the metadata for video_id, replacing any existing row."""
+    _upsert(
+        "INSERT OR REPLACE INTO video_details"
+        " (video_id, title, upload_date, duration, channel_name)"
+        " VALUES (?, ?, ?, ?, ?)",
+        (video_id, title, upload_date, duration, channel_name),
+        "video details",
+    )