Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions data/database_schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- SQL script to define the database schema (SQLite)

-- Create table for storing video details
CREATE TABLE IF NOT EXISTS video_details (
    -- NOT NULL is spelled out because SQLite accepts NULL in a
    -- non-INTEGER PRIMARY KEY column unless told otherwise.
    video_id TEXT PRIMARY KEY NOT NULL,
    title TEXT NOT NULL,
    upload_date TEXT,
    duration INTEGER,
    channel_name TEXT
);

-- Create table for storing transcript data (at most one transcript per video)
CREATE TABLE IF NOT EXISTS transcripts (
    video_id TEXT PRIMARY KEY NOT NULL,
    transcript TEXT NOT NULL,
    -- NOTE: SQLite only enforces this reference on connections that run
    -- "PRAGMA foreign_keys = ON"; it is off by default.
    FOREIGN KEY (video_id) REFERENCES video_details (video_id)
);

-- Add necessary indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_video_details_upload_date ON video_details (upload_date);
-- No explicit index on transcripts (video_id): the PRIMARY KEY constraint
-- already creates a unique index on that column, so a second one would only
-- add storage and write overhead.
27 changes: 27 additions & 0 deletions src/transcripts/generate_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from yt_dlp import YoutubeDL
from datetime import datetime
import sqlite3

from transcripts.utils import (
transcripts_dir,
Expand Down Expand Up @@ -587,6 +588,32 @@ def generate_transcript_page(video_id):
logging.info(f"Transcript page generated at {output_file}")


def connect_to_db():
    """Open a connection to the SQLite transcripts database.

    Returns:
        A ``sqlite3.Connection`` for ``transcripts.db`` located in
        ``data_dir``, or ``None`` if ``sqlite3.connect`` raises
        ``sqlite3.Error`` (the error is logged).
    """
    db_path = data_dir / "transcripts.db"
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        logging.error(f"Error connecting to database: {e}")
        return None


def fetch_transcript_from_db(video_id):
    """Look up the stored transcript text for a video.

    Args:
        video_id: Value matched against the ``video_id`` column of the
            ``transcripts`` table.

    Returns:
        The ``transcript`` column of the matching row, or ``None`` when no
        connection could be opened, no row matches, or the query raises
        ``sqlite3.Error`` (the error is logged).
    """
    db = connect_to_db()
    if not db:
        return None

    query = "SELECT transcript FROM transcripts WHERE video_id=?"
    try:
        cur = db.cursor()
        cur.execute(query, (video_id,))
        row = cur.fetchone()
        if row:
            return row[0]
        return None
    except sqlite3.Error as e:
        logging.error(f"Error fetching transcript from database: {e}")
        return None
    finally:
        # Runs on every path, including the early returns inside the try.
        db.close()


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Generate HTML from YouTube transcript"
Expand Down
38 changes: 38 additions & 0 deletions src/transcripts/transcribe_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
import re
import ffmpeg
import logging
import sqlite3

from transcripts.utils import (
transcripts_dir,
model_dir,
whispercpp_dir,
timeit,
data_dir,
)

log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
Expand Down Expand Up @@ -166,6 +168,33 @@ def clean_up_wav_files(base_filename):
logging.error(f"Error during file cleanup: {e}")


def connect_to_db():
    """Open a connection to the SQLite transcripts database.

    Returns:
        A ``sqlite3.Connection`` for ``transcripts.db`` located in
        ``data_dir``, or ``None`` if ``sqlite3.connect`` raises
        ``sqlite3.Error`` (the error is logged).
    """
    db_path = data_dir / "transcripts.db"
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        logging.error(f"Error connecting to database: {e}")
        return None


def insert_transcript_to_db(video_id, transcript):
    """Store a transcript in the ``transcripts`` table, replacing any earlier one.

    ``video_id`` is the primary key of ``transcripts``, so a plain ``INSERT``
    raises an integrity error as soon as the same video is transcribed a
    second time, leaving the old text in the database. ``INSERT OR REPLACE``
    overwrites the existing row instead.

    Args:
        video_id: Key of the row to write.
        transcript: Full transcript text to store.

    Returns:
        None. If no connection can be opened nothing is written; database
        errors are logged rather than raised.
    """
    conn = connect_to_db()
    if conn is None:
        return

    try:
        # Used as a context manager the connection commits on success and
        # rolls back on an exception; it does NOT close the connection,
        # which is why the explicit close() below is still required.
        with conn:
            conn.execute(
                "INSERT OR REPLACE INTO transcripts (video_id, transcript) VALUES (?, ?)",
                (video_id, transcript),
            )
    except sqlite3.Error as e:
        logging.error(f"Error inserting transcript into database: {e}")
    finally:
        conn.close()


def main():
parser = argparse.ArgumentParser(
description="Automate downloading and processing audio from YouTube/Vimeo."
Expand All @@ -191,6 +220,15 @@ def main():
logging.info("Run whisper.cpp using Metal")
run_whisper(args.name, args.model_name)

# Read the generated transcript
transcript_path = transcripts_dir / f"{args.name}.txt"
try:
with open(transcript_path, "r", encoding="utf-8") as f:
transcript = f.read()
insert_transcript_to_db(args.name, transcript)
except Exception as e:
logging.error(f"Error reading transcript file: {e}")

# Clean up WAV files after processing
# clean_up_wav_files(wav_file_name)

Expand Down
44 changes: 44 additions & 0 deletions src/transcripts/transcribe_new_videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import logging
import os
import sqlite3
from transcripts.utils import data_dir, configs_dir
from pathlib import Path

Expand Down Expand Up @@ -70,6 +71,33 @@ def update_video_files(new_ids, existing_ids, done_file_path, in_file_path):
f.write("")


def connect_to_db():
    """Open a connection to the SQLite transcripts database.

    Returns:
        A ``sqlite3.Connection`` for ``transcripts.db`` located in
        ``data_dir``, or ``None`` if ``sqlite3.connect`` raises
        ``sqlite3.Error`` (the error is logged).
    """
    db_path = data_dir / "transcripts.db"
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        logging.error(f"Error connecting to database: {e}")
        return None


def insert_video_details_to_db(video_id, title, upload_date, duration, channel_name):
    """Insert one video's metadata as a new row in ``video_details``.

    Args:
        video_id: Value for the ``video_id`` column.
        title: Value for the ``title`` column.
        upload_date: Value for the ``upload_date`` column.
        duration: Value for the ``duration`` column.
        channel_name: Value for the ``channel_name`` column.

    Returns:
        None. If no connection can be opened nothing is written; any
        ``sqlite3.Error`` raised by the insert or commit is logged, not
        propagated.
    """
    db = connect_to_db()
    if not db:
        return

    statement = "INSERT INTO video_details (video_id, title, upload_date, duration, channel_name) VALUES (?, ?, ?, ?, ?)"
    row = (video_id, title, upload_date, duration, channel_name)
    try:
        db.cursor().execute(statement, row)
        db.commit()
    except sqlite3.Error as e:
        logging.error(f"Error inserting video details into database: {e}")
    finally:
        db.close()


# Download other videos (manual list)
# Step 2: Load existing video ids from the done file
other_done_ids = load_existing_ids("other_ids_done.txt")
Expand Down Expand Up @@ -159,6 +187,22 @@ def update_video_files(new_ids, existing_ids, done_file_path, in_file_path):
# Transcribe new videos
subprocess.run(["./bash_transcribe.sh", f"./data/{channel['in_file']}"])

# Insert new video details into the database
for vid in new_ids:
# Fetch video details using yt-dlp
cmd = f"yt-dlp -j -- {vid}"
result = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True, text=True)
video_details = json.loads(result.stdout.strip())

# Extract relevant details
title = video_details.get("title", "")
upload_date = video_details.get("upload_date", "")
duration = video_details.get("duration", 0)
channel_name = channel["name"]

# Insert video details into the database
insert_video_details_to_db(vid, title, upload_date, duration, channel_name)

# Generate html and output to docs/
subprocess.run(["python", "src/transcripts/generate_html.py"])

Expand Down
38 changes: 38 additions & 0 deletions src/transcripts/transcribe_youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
import re
import ffmpeg
import logging
import sqlite3

from transcripts.utils import (
transcripts_dir,
model_dir,
whispercpp_dir,
timeit,
data_dir,
)

log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
Expand Down Expand Up @@ -164,6 +166,33 @@ def clean_up_wav_files(base_filename):
logging.error(f"Error during file cleanup: {e}")


def connect_to_db():
    """Open a connection to the SQLite transcripts database.

    Returns:
        A ``sqlite3.Connection`` for ``transcripts.db`` located in
        ``data_dir``, or ``None`` if ``sqlite3.connect`` raises
        ``sqlite3.Error`` (the error is logged).
    """
    db_path = data_dir / "transcripts.db"
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        logging.error(f"Error connecting to database: {e}")
        return None


def insert_transcript_to_db(video_id, transcript):
    """Store a transcript in the ``transcripts`` table, replacing any earlier one.

    ``video_id`` is the primary key of ``transcripts``, so a plain ``INSERT``
    raises an integrity error as soon as the same video is transcribed a
    second time, leaving the old text in the database. ``INSERT OR REPLACE``
    overwrites the existing row instead.

    Args:
        video_id: Key of the row to write.
        transcript: Full transcript text to store.

    Returns:
        None. If no connection can be opened nothing is written; database
        errors are logged rather than raised.
    """
    conn = connect_to_db()
    if conn is None:
        return

    try:
        # Used as a context manager the connection commits on success and
        # rolls back on an exception; it does NOT close the connection,
        # which is why the explicit close() below is still required.
        with conn:
            conn.execute(
                "INSERT OR REPLACE INTO transcripts (video_id, transcript) VALUES (?, ?)",
                (video_id, transcript),
            )
    except sqlite3.Error as e:
        logging.error(f"Error inserting transcript into database: {e}")
    finally:
        conn.close()


def main():
parser = argparse.ArgumentParser(
description="Automate downloading and processing audio from YouTube/Vimeo."
Expand Down Expand Up @@ -191,6 +220,15 @@ def main():
logging.info("Run whisper.cpp using Metal")
run_whisper(wav_file_name, args.model_name)

# Read the generated transcript
transcript_path = transcripts_dir / f"{wav_file_name}.txt"
try:
with open(transcript_path, "r", encoding="utf-8") as f:
transcript = f.read()
insert_transcript_to_db(wav_file_name, transcript)
except Exception as e:
logging.error(f"Error reading transcript file: {e}")

# Clean up WAV files after processing
clean_up_wav_files(wav_file_name)

Expand Down
63 changes: 63 additions & 0 deletions src/transcripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import json
import pandas as pd
from datetime import datetime
import sqlite3

project_dir = Path(__file__).resolve().parents[2]
data_dir = project_dir / "data"
Expand Down Expand Up @@ -79,3 +80,65 @@ def extract_yt_id(url):
else:
# Return None or an appropriate message if no video ID is found
return None


def connect_to_db():
    """Open a connection to the SQLite transcripts database.

    Returns:
        A ``sqlite3.Connection`` for ``transcripts.db`` located in
        ``data_dir``, or ``None`` if ``sqlite3.connect`` raises
        ``sqlite3.Error`` (the error is logged).
    """
    db_path = data_dir / "transcripts.db"
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        logging.error(f"Error connecting to database: {e}")
        return None


def fetch_transcript_from_db(video_id):
    """Look up the stored transcript text for a video.

    Args:
        video_id: Value matched against the ``video_id`` column of the
            ``transcripts`` table.

    Returns:
        The ``transcript`` column of the matching row, or ``None`` when no
        connection could be opened, no row matches, or the query raises
        ``sqlite3.Error`` (the error is logged).
    """
    db = connect_to_db()
    if not db:
        return None

    query = "SELECT transcript FROM transcripts WHERE video_id=?"
    try:
        cur = db.cursor()
        cur.execute(query, (video_id,))
        row = cur.fetchone()
        if row:
            return row[0]
        return None
    except sqlite3.Error as e:
        logging.error(f"Error fetching transcript from database: {e}")
        return None
    finally:
        # Runs on every path, including the early returns inside the try.
        db.close()


def insert_transcript_to_db(video_id, transcript):
    """Store a transcript in the ``transcripts`` table, replacing any earlier one.

    ``video_id`` is the primary key of ``transcripts``, so a plain ``INSERT``
    raises an integrity error as soon as the same video is transcribed a
    second time, leaving the old text in the database. ``INSERT OR REPLACE``
    overwrites the existing row instead.

    Args:
        video_id: Key of the row to write.
        transcript: Full transcript text to store.

    Returns:
        None. If no connection can be opened nothing is written; database
        errors are logged rather than raised.
    """
    conn = connect_to_db()
    if conn is None:
        return

    try:
        # Used as a context manager the connection commits on success and
        # rolls back on an exception; it does NOT close the connection,
        # which is why the explicit close() below is still required.
        with conn:
            conn.execute(
                "INSERT OR REPLACE INTO transcripts (video_id, transcript) VALUES (?, ?)",
                (video_id, transcript),
            )
    except sqlite3.Error as e:
        logging.error(f"Error inserting transcript into database: {e}")
    finally:
        conn.close()


def insert_video_details_to_db(video_id, title, upload_date, duration, channel_name):
    """Insert one video's metadata as a new row in ``video_details``.

    Args:
        video_id: Value for the ``video_id`` column.
        title: Value for the ``title`` column.
        upload_date: Value for the ``upload_date`` column.
        duration: Value for the ``duration`` column.
        channel_name: Value for the ``channel_name`` column.

    Returns:
        None. If no connection can be opened nothing is written; any
        ``sqlite3.Error`` raised by the insert or commit is logged, not
        propagated.
    """
    db = connect_to_db()
    if not db:
        return

    statement = "INSERT INTO video_details (video_id, title, upload_date, duration, channel_name) VALUES (?, ?, ?, ?, ?)"
    row = (video_id, title, upload_date, duration, channel_name)
    try:
        db.cursor().execute(statement, row)
        db.commit()
    except sqlite3.Error as e:
        logging.error(f"Error inserting video details into database: {e}")
    finally:
        db.close()