Skip to content

Continuous recognition from microphone #3

@mdewing

Description

@mdewing

Some sample code that does continuous recognition from a microphone. I can make a pull request if you're interested.

The intents are longer than in openWakeWord so alternating between recording and detection will not work very well. This code uses the callback mode of PyAudio to put the input chunks in a queue and then keeps a buffer that is two seconds in length for the detection.

# Continuous recognition from a microphone

import pyaudio
import numpy as np
import time
import queue

from openspeechtointent.model import CitrinetModel

# Load model (this will also download the model if it is not already present)
mdl = CitrinetModel()

# Define some simple intents
intents = [
    "turn on the lights",
    "turn off the lights",
    "pause the music",
    "set a 5 minute timer",
    "set a 10 minute timer",
    "remind me to buy apples tomorrow",
    "remind me to buy pears tomorrow",
]


# PyAudio capture settings: 16-bit mono PCM at 16 kHz, delivered in
# 1280-sample chunks (80 ms per chunk at this sample rate).
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1280

# 2 second buffer
# nchunk * CHUNK / RATE = 25 * 1280 / 16000 = 2.0 seconds of audio
nchunk = 25

audio_interface = pyaudio.PyAudio()

class MicrophoneBuffer:
    """Rolling buffer holding the most recent ``nchunk`` chunks of mic audio.

    PyAudio's callback mode delivers audio on a background thread; the
    callback only enqueues the raw bytes, and ``get()`` (called from the
    main thread) drains the queue into a fixed-length int16 array.

    Parameters
    ----------
    nchunk : int
        Number of chunk-sized frames the buffer holds (25 -> 2 s at 16 kHz).
    chunk : int or None
        Samples per frame; defaults to the module-level ``CHUNK`` (1280).
    """

    def __init__(self, nchunk, chunk=None):
        self.nchunk = nchunk
        # Resolve the default at call time so the module-level CHUNK is used.
        self.chunk = CHUNK if chunk is None else chunk
        buf_len = self.nchunk * self.chunk
        self.buffer = np.zeros(buf_len, dtype=np.int16)
        # Number of chunks written so far during the initial fill phase.
        self.idx = 0
        self.buffer_queue = queue.Queue()
        self.mic_stream = audio_interface.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=self.chunk,
            stream_callback=self.callback,
        )

    # Using PyAudio callback mode, this function is called in a separate thread
    def callback(self, in_data, frame_count, time_info, status):
        """Enqueue the raw chunk bytes; keep the input-only stream running."""
        self.buffer_queue.put(in_data)
        # Input-only stream: no output data to return.
        return (None, pyaudio.paContinue)

    def get(self):
        """Drain queued chunks into the buffer and return it.

        Returns the internal int16 array of length ``nchunk * chunk``
        holding the most recent audio, oldest samples first.
        """
        while not self.buffer_queue.empty():
            audio_data = np.frombuffer(self.buffer_queue.get(), dtype=np.int16)
            if self.idx < self.nchunk:
                # Initial filling of the audio content buffer.
                loc = self.idx * self.chunk
                # Bug fix: original wrote to the bare name `buffer`, which
                # only worked through an accidental module-level alias.
                self.buffer[loc : loc + self.chunk] = audio_data
                self.idx += 1
            else:
                # Steady state: shift left one chunk, append the new chunk.
                self.buffer = np.roll(self.buffer, -self.chunk)
                # Bug fix: original slice [-CHUNK - 1 : -1] was off by one,
                # leaving the final sample stale; fill the last `chunk`.
                self.buffer[-self.chunk :] = audio_data
        return self.buffer

    def reset(self):
        """Zero the buffer and restart the initial-fill phase."""
        self.buffer[:] = 0
        self.idx = 0

# Continuously score the rolling 2-second buffer against the intent list
# and report any intent whose softmax score clears the threshold.
mic = MicrophoneBuffer(nchunk)

while True:
    buffer = mic.get()

    t_start = time.perf_counter_ns()
    matched_intents, scores, durations = mdl.match_intents(
        buffer, intents, softmax_scores=True
    )
    t_end = time.perf_counter_ns()

    for intent, score, duration in zip(matched_intents, scores, durations):
        if score <= 0.45:
            continue
        print(f"Intent: {intent}, Score: {score}, Duration: {duration}")
        print("match time (ms)", (t_end - t_start) / 1e6)
        # Clear the buffer after a match
        mic.reset()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions