You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I have an HTML frontend that sends audio chunks to a FastAPI-based backend over a websocket in byte form, but after decoding and transcribing, the result is always an empty string. Can somebody help me here?
import json
import re
from typing import Union
from venv import logger  # NOTE(review): unusual logger source; consider logging.getLogger(__name__)

import numpy as np
from fastapi import FastAPI, WebSocket
from fastapi.responses import HTMLResponse
from faster_whisper import WhisperModel
from pydantic import BaseModel
model_size = "base.en"
app = FastAPI()
class ChatModel(BaseModel):
    """Schema for one JSON message exchanged over the /ws websocket."""

    # "text" for chat messages; any other value is treated as the audio path.
    messageType: str
    # Chat text, an audio-chunk payload, or the "EOA" end-of-audio marker.
    message: Union[str, bytes, None] = None
    # Running conversation history (text messages only).  A mutable default
    # is safe here: pydantic deep-copies field defaults per instance.
    history: list[str] = []
# CPU inference with int8 quantization keeps memory use modest.
model = WhisperModel(model_size, device="cpu", compute_type="int8")
logger.setLevel(10)  # 10 == logging.DEBUG, for debugging

# BUG FIX: read the static page through a context manager so the file
# handle is closed promptly (the old open(...).read() leaked it), and pin
# the text encoding instead of relying on the platform default.
with open("static/index.html", encoding="utf-8") as _f:
    html = _f.read()
@app.get("/")
async def get():
return HTMLResponse(html)
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
history = list[str]()
audioBuffer = list[bytes]()
while True:
data = await websocket.receive_json()
# print(data)
if isinstance(data, dict): # "EOA" message
chat = ChatModel(
history=history,
message=data["message"],
messageType=data["messageType"],
)
if chat.messageType == "text":
history.append(chat.message)
chat.history = history
await websocket.send_text(
f"{chat.messageType}: {chat.message} \n {chat.history}"
)
elif chat.message == "EOA":
# buf = bytes(chat.message.encode("utf-8"))
# what if this takes so long the client gives up? should send incremental results,
# or a least pretend results, while we process
audio_data = b"".join(audioBuffer)
print(data)
print(
f"audio buffer had {len(data)} segments, final length {len(audio_data)}"
)
# Save to file
with open("audio.wav", "wb") as f:
f.write(audio_data)
audio_np = (
np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
/ 32768.0
)
segments, _ = model.transcribe(audio_np)
print(f"Info: {_}")
rr = []
for segment in segments:
print(segment.text)
rr.append(segment.text)
m = " ".join(rr)
# print("recognition result before strip:", m)
m = m.strip()
# sometimes whisper puts in commentary like [soft music] and we strip that out:
# print("recognition result before bracket re:", m)
m = re.sub(r"\[.*\]", "", m)
# print("recognition result after bracket re:", m)
# was having problems with things like "He said hello" which became He said, "Hello"
m = re.sub(r'"', '\\"', m) # convert " to \\"
# print("recognition result after quoting:", m)
# send json result to konele client:
msg = f'{{"status": 0, "result": {{"hypotheses": [{{"transcript": "{m}"}}], "final": true}}}}'
print("msg is", msg)
await websocket.send_text(msg)
else:
audioBuffer.append(chat.message.encode("utf-8"))
print(f"audio buf now has {len(audioBuffer)} segments")```
The text was updated successfully, but these errors were encountered:
I have an HTML frontend that sends audio chunks to a FastAPI-based backend over a websocket in byte form, but after decoding and transcribing, the result is always an empty string. Can somebody help me here?
HTML Frontend
FastAPI Backend
The text was updated successfully, but these errors were encountered: