label-speakers-lambda/app.py at main · musicsnobj/label-speakers-lambda · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import os
import re
import heapq
import urllib.parse
from botocore.config import Config
import boto3
from collections import defaultdict
from pydantic import BaseModel, Field
import instructor
from anthropic import AnthropicBedrock

# Structured result requested from the LLM for a single transcript chunk.
# It is passed as `response_model` to `client.messages.create` in
# get_speaker_label_confidence_scores, and lambda_handler sums the two
# confidence fields across all chunks.
# NOTE(review): documented with `#` comments rather than a docstring on purpose;
# presumably a class docstring would surface in the schema pydantic generates
# for the model call — verify before adding one.
class SpeakerConfidence(BaseModel):
    # Free-text justification the model gives for the two scores below.
    reasoning: str
    # Confidence (0-100 inclusive) that speaker_A is Jess and speaker_B is Jack Brett.
    speaker_A_jess_confidence: int = Field(ge=0, le=100)
    # Confidence (0-100 inclusive) that speaker_A is Jack Brett and speaker_B is Jess.
    speaker_A_jack_confidence: int = Field(ge=0, le=100)

# S3 client used by lambda_handler to read the source transcript and write the
# relabelled one. Created once, at module import time.
s3 = boto3.client("s3")


# Anthropic-on-Bedrock client, pinned to us-east-1, wrapped by `instructor` so
# that calls can take a `response_model` (see get_speaker_label_confidence_scores).
bedrock_client = AnthropicBedrock(
    aws_region="us-east-1",
)
client = instructor.from_anthropic(bedrock_client)

# Deployment configuration read from the environment at import time.
# `os.environ.get` returns None when a variable is unset, so both values may be
# None; nothing in this file validates them before use.
OUTPUT_BUCKET = os.environ.get("OUTPUT_BUCKET")
MODEL_ID = os.environ.get("BEDROCK_MODEL_ID")
# Matches an underscore followed by a dd-dd-dddd digit group (captured).
# NOTE(review): not referenced anywhere in the visible part of this file.
DATE_RE = re.compile(r"_(\d{2}-\d{2}-\d{4})")

def _relabel_speakers(text: str, label_map: dict[str, str]) -> str:
    """Replace every whole occurrence of each key of *label_map* with its value.

    A plain ``str.replace`` would also rewrite the prefix of longer labels
    (``spk_1`` inside ``spk_10``), so each label is matched only when it is not
    directly preceded or followed by a word character.
    """
    for old_label, new_label in label_map.items():
        whole_label = re.compile(rf"(?<!\w){re.escape(old_label)}(?!\w)")
        # A callable replacement keeps `new_label` literal (no backslash/group expansion).
        text = whole_label.sub(lambda _match, _new=new_label: _new, text)
    return text


def lambda_handler(event, context):
    """Relabel the two hosts of a transcript and store the result in OUTPUT_BUCKET.

    Steps:
      1. Read the transcript named by the first record of the S3 event.
      2. Treat the two speaker labels with the most lines as the hosts.
      3. Build a host-only copy with neutral labels ('speaker_A'/'speaker_B'),
         split it into chunks and ask the LLM for per-chunk confidence scores.
      4. Sum the scores; speaker A is taken to be Jess when the Jess total is
         greater than or equal to the Jack Brett total (ties, including the
         zero-chunk case, therefore resolve to Jess).
      5. Rewrite the host labels in the full transcript and upload it under
         the source object's base name.

    Args:
        event: S3 event notification; only ``event["Records"][0]`` is read.
        context: Lambda context object (unused).

    Returns:
        dict with ``output_bucket`` and ``output_key`` of the written object.

    Raises:
        RuntimeError: if fewer than two speaker labels are found in the transcript.
    """
    record = event["Records"][0]
    src_bucket = record["s3"]["bucket"]["name"]
    # Object keys in the event are URL-encoded (spaces arrive as '+').
    src_key = urllib.parse.unquote_plus(record["s3"]["object"]["key"])

    # Download formatted transcript
    obj = s3.get_object(Bucket=src_bucket, Key=src_key)
    transcript = obj["Body"].read().decode("utf-8")
    base_name = src_key.rsplit("/", 1)[-1]

    # Deduce host labels (the two speakers with the most lines)
    speaker_line_counts = count_lines_by_speaker(transcript)
    top_2_speaker_labels = tuple(
        speaker for speaker, _ in
        heapq.nlargest(2, speaker_line_counts.items(), key=lambda item: item[1])
    )
    if len(top_2_speaker_labels) != 2:
        raise RuntimeError(f"The file {src_key} contains fewer than two speakers")

    speaker_A_label, speaker_B_label = top_2_speaker_labels
    print(f"speaker A label: {speaker_A_label}")
    print(f"speaker B label: {speaker_B_label}")

    # Strip out non-host speaker lines, then swap the Transcribe labels for
    # 'speaker_A' / 'speaker_B' so the LLM only deals with one set of labels.
    clean_transcript = remove_non_host_lines(transcript, speaker_A_label, speaker_B_label)
    clean_transcript = _relabel_speakers(
        clean_transcript,
        {
            speaker_A_label: "speaker_A",
            speaker_B_label: "speaker_B",
        },
    )

    # Split into chunks
    chunks = chunk_transcript_by_chars(clean_transcript, max_chunk_chars=12000)
    print(f"Splitting transcript into {len(chunks)} chunks")

    # Keep a tally of the model's confidence re: which label belongs to which host
    spk_A_jess_total = 0
    spk_A_jack_total = 0
    for i, chunk in enumerate(chunks):
        scores = get_speaker_label_confidence_scores(chunk)
        print(f"chunk {i} scores: {scores}")
        spk_A_jess_total += scores.speaker_A_jess_confidence
        spk_A_jack_total += scores.speaker_A_jack_confidence
    print(f"confidence that {speaker_A_label} is Jess: {spk_A_jess_total}")
    print(f"confidence that {speaker_A_label} is Jack Brett: {spk_A_jack_total}")
    spk_A_is_jess = spk_A_jess_total >= spk_A_jack_total

    # Now we know who's who: save the full transcript with corrected labels.
    # Whole-label matching leaves other speakers (e.g. spk_10 when a host is
    # spk_1) untouched.
    transcript = _relabel_speakers(
        transcript,
        {
            speaker_A_label: "Jess" if spk_A_is_jess else "Jack Brett",
            speaker_B_label: "Jack Brett" if spk_A_is_jess else "Jess",
        },
    )

    out_key = base_name  # or adjust if you want a different pattern
    s3.put_object(
        Bucket=OUTPUT_BUCKET,
        Key=out_key,
        Body=transcript.encode("utf-8"),
        ContentType="text/plain; charset=utf-8",
    )

    return {"output_bucket": OUTPUT_BUCKET, "output_key": out_key}

def count_lines_by_speaker(transcript: str) -> dict[str, int]:
    """
    Tally, per speaker label, how many transcript lines that speaker opens.

    A line counts when, after surrounding whitespace is stripped, it begins
    with a label of the form 'spk_<digits>' followed by whitespace and an
    opening parenthesis. All other lines (blank ones included) are ignored.
    Labels appear in the returned dict in order of first appearance.
    """
    label_at_start = re.compile(r"^(spk_\d+)\s+\(")
    tally: dict[str, int] = {}
    for raw_line in transcript.split("\n"):
        found = label_at_start.match(raw_line.strip())
        if found is None:
            # Blank line, or a line that does not start with a speaker label.
            continue
        label = found.group(1)
        tally[label] = tally.get(label, 0) + 1
    return tally

def remove_non_host_lines(transcript: str, speaker_A_label: str, speaker_B_label: str) -> str:
    """
    Return the transcript with only the lines belonging to speaker A or speaker B.

    A line belongs to a host when it starts with that host's label AND the
    label is not immediately followed by another word character. The second
    condition keeps 'spk_1' from also matching lines of 'spk_10', 'spk_11', ...

    Args:
        transcript: full transcript text, one utterance per '\\n'-separated line.
        speaker_A_label: label of the first host (e.g. 'spk_0').
        speaker_B_label: label of the second host.

    Returns:
        The kept lines, in original order, joined with '\\n' (empty string if none).
    """
    host_labels = (speaker_A_label, speaker_B_label)

    def is_host_line(line: str) -> bool:
        for label in host_labels:
            if line.startswith(label):
                # Character right after the label ('' when the line ends there).
                next_char = line[len(label):len(label) + 1]
                if not (next_char.isalnum() or next_char == "_"):
                    return True
        return False

    return "\n".join(line for line in transcript.split("\n") if is_host_line(line))


def get_speaker_label_confidence_scores(chunk_text: str) -> SpeakerConfidence:
    """Ask the LLM which host 'speaker_A' is for one transcript chunk.

    Sends a fixed system prompt (podcast background, line format, and the
    per-host identification rules) plus the chunk as the user message, and
    returns the reply parsed into a ``SpeakerConfidence``.

    Args:
        chunk_text: transcript chunk whose lines are labelled 'speaker_A' /
            'speaker_B'.

    Returns:
        SpeakerConfidence with the model's reasoning and two 0-100 scores.
    """
    # The prompt is built from adjacent string literals; every literal that
    # continues a sentence must carry its own separating space, because
    # implicit concatenation inserts nothing between them.
    system_prompt = (
        "You are analyzing a chunk of a podcast transcript and assigning confidence scores for \n"
        "which speaker is which podcast co-host based on a set of rules.\n"
        "The podcast is titled: However Comma.\n"
        "The podcast stars two co-hosts and best friends: Jess and Jack Brett.\n"
        "The content of the podcast includes political discussion, personal updates in the lives of the co-hosts, "
        "strategy discussion around 'Zelda: Breath of the Wild' for Nintendo Switch, and discussions of the podcast itself.\n"
        "Speakers in the transcript are labeled 'speaker_A' and 'speaker_B'. 'speaker_A' is one of the co-hosts, and 'speaker_B' is the other.\n"
        "Each line also includes a timestamp (in hh:mm:ss format) in parentheses. The format of a single line is as follows:\n"
        "{speaker} ({timestamp}): {text}\n"
        "In other words, a line of text MUST be attributed to the speaker label (speaker_A or speaker_B) at the beginning of that line.\n"
        "An example line by speaker_B:\n"
        "speaker_B (00:31:16): And uh it'll and the theme like I said the opening theme it'll be short punchy it'll probably be that that third option the one that your favorite out of the three that I sent you uh just for now like it'll be made be uh I mean as I get better at uh recording this stuff it'll uh maybe we'll have a better one later\n"
        "\n"
        "Your task is to read the transcript chunk, and output a JSON object with three fields:\n"
        "reasoning (string): a summary of the reasoning for your calculated scores\n"
        "speaker_A_jess_confidence (integer, 0-100): your confidence based on the given transcript chunk that speaker_A is Jess and speaker_B is Jack Brett\n"
        "speaker_A_jack_confidence (integer, 0-100): your confidence based on the given transcript chunk that speaker_A is Jack Brett and speaker_B is Jess\n"
        "The rules are as follows:\n"
        "A speaker is likely to be Jess if any of the following are true:\n"
        "- speaker refers to self as 'Jess' or 'Jessica', or other speaker refers to them as 'Jess' or 'Jessica'\n"
        "- speaker refers to other speaker as 'Jack Brett' or 'Jack (something)'\n"
        "- speaker mentions owning a house, or other speaker refers to them as owning a house\n"
        "- speaker mentions living in South Carolina, or other speaker refers to them as living in South Carolina\n"
        "- speaker mentions their husband Joe, or other speaker refers to them as having a husband named Joe\n"
        "- speaker expresses right-of-center political opinion, or other speaker refers to them as having right-of-center political opinions"
        " (especially a desire to shift more power from the federal to the state level)\n"
        "- speaker mentions being female, or other speaker refers to them as being female\n"
        "- speaker mentions being attracted to men, or other speaker refers to them as being attracted to men\n"
        "- speaker mentions being a professional video editor, or other speaker refers to them as a professional video editor\n"
        "- speaker mentions their job for the podcast is video editing, or other speaker refers to them as being in charge of editing video for the podcast\n"
        # NOTE(review): 'Richter Studios' and 'Richer Studios' are spelled differently
        # on the next line; left as-is because the correct name is not knowable here.
        "- speaker mentions they work for a guy named Jeremy, or company Richter Studios, or other speaker refers to them as working for Jeremy/Richer Studios\n"
        "- speaker mentions their husband Joe or parents having previously been in the Navy (though Jess herself has not)\n"
        "- speaker mentions a desire for aliens to come take over Earth\n"
        "\n"
        "A speaker is likely to be Jack Brett if any of the following are true:\n"
        "- speaker refers to self as 'Jack Brett', or another speaker refers to them as 'Jack Brett' or 'Jack (something)'\n"
        "- speaker refers to the other speaker as 'Jess' or 'girl'\n"
        "- speaker expresses left-of-center political opinions, or other speaker refers to them as having left-of-center political opinions\n"
        "- speaker mentions something they heard on the \"fake news media\"\n"
        "- speaker shows a general disdain for Donald Trump, or is referred to as having Trump Derangement Syndrome (TDS)\n"
        "- speaker mentions being male, or other speaker refers to them as being male\n"
        "- speaker mentions being attracted to women, or other speaker refers to them as being attracted to women"
        " (especially women in tight pants or leggings)\n"
        "- speaker mentions living in Chicago, or other speaker refers to them as living in Chicago\n"
        "- speaker mentions being a professional coder/developer, or other speaker refers to them as being a professional coder/developer\n"
        "- speaker mentions they play in a band\n"
        "- speaker mentions having been in the Navy, or other speaker refers to them as having been in the Navy\n"
        "- speaker mentions an agency they used to work for called PunchKick\n"
        "- speaker mentions their jobs for the podcast are drawing, recording music, sourcing clips and transcribing audio via AWS, "
        "or another speaker refers to them as being the one who draws, records music, sources clips or transcribes audio via AWS for the podcast\n"
        "- speaker mentions their weekly psychedelic \"space-time journey\", or other speaker refers to them as having a weekly psychedelic \"space-time journey\"\n"
        "- speaker mentions their current employer Dan Wolf (or company Wolfco), or other speaker refers to them as currently working for Dan Wolf (Wolfco)\n"
        "\n"
        "Ensure your scores are consistent with your reasoning.\n"
        "Output only the JSON object, parsable by Python `json.loads` (DO NOT WRAP in 'json```' or anything). Example response:\n"
        "{\"speaker_A_jess_confidence\": 21, \"speaker_A_jack_confidence\": 79, \"reasoning\": \"speaker_B mentions looking at pictures of girls in tight pants, which matches Jack Brett's known attraction to women in tight pants/leggings. speaker_B also expresses left-of-center political opinions by defending support staff salaries and criticizing the Trump administration's NIH funding cuts, showing concern for workers and skepticism of conservative framing. speaker_A presents right-of-center political opinions by defending Trump's NIH policy changes, criticizing mainstream media headlines as misleading, and suggesting people should be 'thanking Donald Trump'. speaker_A also refers to speaker_B with 'you' when discussing therapy, consistent with their established dynamic. The political positions and the tight pants reference strongly indicate speaker_B is Jack Brett and speaker_A is Jess.\"}"
    )

    # NOTE(review): max_tokens=500 has to cover the free-text reasoning plus both
    # scores — confirm it is enough for the longest expected reply.
    return client.messages.create(
        model=MODEL_ID,
        response_model=SpeakerConfidence,
        max_tokens=500,
        temperature=0.2,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": f"Transcript chunk:\n{chunk_text}"
            }
        ]
    )

def chunk_transcript_by_chars(transcript: str, max_chunk_chars: int = 8000):
    """
    Greedily pack whole transcript lines into chunks of at most
    `max_chunk_chars` characters (line endings included in the count).

    Lines are never split: a single line longer than the budget becomes a
    chunk of its own and exceeds the limit. Concatenating the returned chunks
    reproduces the input. An empty transcript yields an empty list.
    """
    pieces = []
    pending = []
    pending_size = 0

    for segment in transcript.splitlines(keepends=True):
        would_overflow = pending_size + len(segment) > max_chunk_chars
        if pending and would_overflow:
            # Close the chunk in progress before this line starts the next one.
            pieces.append("".join(pending))
            pending, pending_size = [], 0
        pending.append(segment)
        pending_size += len(segment)

    # Flush whatever is left after the final line.
    if pending:
        pieces.append("".join(pending))

    return pieces