STT_Server/src/functor/vad_functor.py

60 lines
2.0 KiB
Python

from funasr import AutoModel
from typing import List, Dict, Any
from src.models import VADResponse
from src.models import AudioBinary_Config
from src.functor.audiochunk import AudioChunk
from src.models import AudioBinary_Chunk
from typing import Callable
class VAD:
    """Streaming voice-activity-detection wrapper around a VAD model.

    Every pushed audio buffer is stored in an ``AudioChunk`` and passed to the
    model's ``generate`` call; non-empty detections are accumulated in a
    ``VADResponse``. ``process_vad_result`` then wraps the buffered audio of
    each reported time range in an ``AudioBinary_Chunk`` and hands it to a
    callback.
    """

    def __init__(
        self,
        VAD_model: Any = None,
        audio_config: "AudioBinary_Config | None" = None,
        callback: "Callable | None" = None,
    ):
        """Set up the model, the result accumulator, the audio buffer and the cache.

        Args:
            VAD_model: Object exposing
                ``generate(input=, cache=, chunk_size=, is_final=)``. When
                ``None``, the ``fsmn-vad`` model (revision ``v2.0.4``) is
                loaded through ``AutoModel`` with updates disabled.
            audio_config: Audio configuration handed to ``AudioChunk``. Its
                ``chunk_size`` attribute is read on every
                ``push_binary_data`` call.
            callback: Stored on the ``VADResponse`` as
                ``time_chunk_index_callback``; ``process_vad_result`` falls
                back to it when no explicit callback is passed.
        """
        # VAD model: fall back to the default one only when none was injected.
        if VAD_model is None:
            VAD_model = AutoModel(
                model="fsmn-vad", model_revision="v2.0.4", disable_update=True
            )
        self.VAD_model = VAD_model
        # Audio config.
        self.audio_config = audio_config
        # Accumulated VAD result.
        self.vad_result = VADResponse(time_chunk_index_callback=callback)
        # Buffer holding the pushed binary audio.
        self.audio_chunk = AudioChunk(audio_config=self.audio_config)
        # The same dict is passed to every generate() call of this instance.
        self.cache = {}

    def push_binary_data(self, binary_data: bytes, *, is_final: bool = False) -> None:
        """Buffer one piece of audio and run the VAD model on it.

        Args:
            binary_data: Audio bytes; appended to the audio buffer and passed
                as ``input`` to the model.
            is_final: Forwarded unchanged as the model's ``is_final`` flag.
                Defaults to ``False``, which is what this method always sent
                before the parameter existed.
                NOTE(review): FunASR's streaming VAD example sets this to
                ``True`` for the last chunk of a stream — confirm that callers
                do so when a stream ends.
        """
        # Push the binary data.
        self.audio_chunk.add_chunk(binary_data)
        # Process the audio chunk.
        res = self.VAD_model.generate(
            input=binary_data,
            cache=self.cache,
            chunk_size=self.audio_config.chunk_size,
            is_final=is_final,
        )
        # Merge only when the model reported something; an empty result list
        # is treated like "no detection" instead of raising IndexError.
        if res and len(res[0]["value"]) > 0:
            self.vad_result += VADResponse.from_raw(res)

    def set_callback(self, callback: Callable) -> None:
        """Replace the callback stored on the accumulated VAD result."""
        self.vad_result.time_chunk_index_callback = callback

    def process_vad_result(self, callback: "Callable | None" = None):
        """Hand the buffered audio of each reported time range to a callback.

        Args:
            callback: Called with one ``AudioBinary_Chunk`` per item. When
                ``None``, the callback stored on the VAD result is used. If
                that is ``None`` as well, calling it for the first item fails
                because ``None`` is not callable.
        """
        # Process the VAD result.
        if callback is None:
            callback = self.vad_result.time_chunk_index_callback

        def emit(time_range):
            # time_range is indexed by "start_time" and "end_time"; the same
            # two values select the audio returned by the buffer.
            start, end = time_range["start_time"], time_range["end_time"]
            return callback(
                AudioBinary_Chunk(
                    start_time=start,
                    end_time=end,
                    chunk=self.audio_chunk.get_chunk(start, end),
                )
            )

        self.vad_result.process_time_chunk(emit)