
Support for audio buffers #169

Open

frankh077 opened this issue Mar 7, 2024 · 1 comment

Comments

@frankh077
I'm trying to process audio buffers that I receive over a WebSocket and forward them to the whisper server, also over a WebSocket, but I can't get a coherent transcription.
I based my implementation on the code from the Chrome extension. This is my code:

```javascript
const WebSocket = require('ws');

// RFC 4122-style v4 UUID built from the current time plus Math.random().
function generateUUID() {
    let dt = new Date().getTime();
    return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
        const r = (dt + Math.random() * 16) % 16 | 0;
        dt = Math.floor(dt / 16);
        return (c === 'x' ? r : (r & 0x3 | 0x8)).toString(16);
    });
}

// Linear-interpolation resampler from origSampleRate to 16 kHz.
function resampleTo16kHZ(audioData, origSampleRate) {
    const data = new Float32Array(audioData);
    const targetLength = Math.round(data.length * (16000 / origSampleRate));
    const resampledData = new Float32Array(targetLength);
    const springFactor = (data.length - 1) / (targetLength - 1);
    resampledData[0] = data[0];
    resampledData[targetLength - 1] = data[data.length - 1];

    for (let i = 1; i < targetLength - 1; i++) {
        const index = i * springFactor;
        const leftIndex = Math.floor(index);
        const rightIndex = Math.ceil(index);
        const fraction = index - leftIndex;
        resampledData[i] = data[leftIndex] + (data[rightIndex] - data[leftIndex]) * fraction;
    }

    return resampledData;
}

class WhisperASRClient {
    constructor() {
        this.socket = null;
    }

    // Open the connection, send the configuration message,
    // and resolve once the server reports SERVER_READY.
    async setupASR() {
        const uuid = generateUUID();
        this.socket = new WebSocket('ws://localhost:9090/');
        this.socket.onopen = () => {
            this.socket.send(
                JSON.stringify({
                    uid: uuid,
                    language: 'es',
                    task: 'transcribe',
                    model: 'small',
                    use_vad: true
                })
            );
        };

        return new Promise((resolve) => {
            this.socket.onmessage = (event) => {
                const data = JSON.parse(event.data);
                if (data.message === 'SERVER_READY') {
                    resolve(true);
                }
            };
        });
    }

    // Forward transcription messages to the caller's callback.
    async mainASR(transcription_cb) {
        if (this.socket === null) {
            console.error('socket is null');
            return;
        }
        this.socket.onmessage = (event) => {
            const data = JSON.parse(event.data);
            transcription_cb({
                transcript: data.message,
                is_final: data.status === 'FINAL'
            });
        };
    }

    // Resample incoming 8 kHz audio to 16 kHz and send it to the server.
    // Note: reassigning onmessage here replaces the handler set in mainASR.
    writeAudioData(audioData) {
        const inputData = new Float32Array(audioData);
        const audioData16kHz = resampleTo16kHZ(inputData, 8000);
        this.socket.send(audioData16kHz);

        this.socket.onmessage = (event) => {
            const data = JSON.parse(event.data);
            if (data.segments) {
                for (const segment of data.segments) {
                    console.log(segment.text);
                }
            }
        };
    }

    end() {
        this.socket.close();
    }
}

module.exports = WhisperASRClient;
```
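For reference, the linear-interpolation resampler above can be exercised on its own, without the server. This sketch (the sine-wave test signal is illustrative) checks that an 8 kHz buffer doubles in length at 16 kHz and that the endpoints are preserved:

```javascript
// Same linear-interpolation logic as the resampler above,
// using numeric indices throughout.
function resampleTo16kHZ(audioData, origSampleRate) {
    const data = new Float32Array(audioData);
    const targetLength = Math.round(data.length * (16000 / origSampleRate));
    const resampledData = new Float32Array(targetLength);
    const springFactor = (data.length - 1) / (targetLength - 1);
    resampledData[0] = data[0];
    resampledData[targetLength - 1] = data[data.length - 1];

    for (let i = 1; i < targetLength - 1; i++) {
        const index = i * springFactor;
        const left = Math.floor(index);
        const right = Math.ceil(index);
        const fraction = index - left;
        resampledData[i] = data[left] + (data[right] - data[left]) * fraction;
    }

    return resampledData;
}

// 100 ms of a 440 Hz sine at 8 kHz: 800 samples in, 1600 samples out.
const input = new Float32Array(800);
for (let i = 0; i < input.length; i++) {
    input[i] = Math.sin(2 * Math.PI * 440 * i / 8000);
}
const output = resampleTo16kHZ(input, 8000);
console.log(output.length); // 1600
```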

According to the extension code, the audio buffer must be converted to a Float32Array and then resampled before it is sent over the WebSocket to the whisper server, but there seems to be a compatibility problem with how the audio is sent, which causes the transcriptions to be processed incorrectly.

Any ideas on how I should send audio buffers to Whisper so they are processed correctly?

@cjpais

cjpais commented Mar 21, 2024

A bit late, but the following React component works for me.

The whisper-live server accepts f32le PCM at a 16000 Hz sampling rate:

https://gist.github.com/cjpais/6e6555125cac1527c8f3ddcce9d05644
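In other words, the payload should be the raw little-endian bytes of the 16 kHz Float32Array. A minimal Node sketch of that framing (the helper name `toF32LEBytes` and the four-sample chunk are illustrative, not part of the gist):

```javascript
// Convert a 16 kHz Float32Array into the raw f32le byte stream the
// whisper-live server expects. On little-endian hosts the typed array's
// underlying buffer is already f32le; writeFloatLE forces the byte order
// regardless of the host's endianness.
function toF32LEBytes(samples) {
    const out = Buffer.alloc(samples.length * 4);
    for (let i = 0; i < samples.length; i++) {
        out.writeFloatLE(samples[i], i * 4);
    }
    return out;
}

const chunk = new Float32Array([0.0, 0.5, -0.5, 1.0]);
const bytes = toF32LEBytes(chunk);
console.log(bytes.length); // 16 (4 samples * 4 bytes each)
// socket.send(bytes);  // `socket` is an open ws connection (illustrative)
```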
