# STT_Server/tests/modelsuse.py
#
# 84 lines
# 2.9 KiB
# Python

from funasr import AutoModel
from typing import List, Dict, Any
from src.models import VADResponse
import time
def vad_model_use_online(file_path: str) -> "VADResponse":
    """Run the streaming ``fsmn-vad`` model over an audio file, slice by slice.

    The whole file is read with ``soundfile`` and then handed to the model in
    100 ms slices, sleeping 0.1 s before each slice. Every non-empty model
    result is converted with ``VADResponse.from_raw`` and merged into one
    accumulated ``VADResponse`` via ``+=``, after which
    ``process_time_chunk()`` is called on the accumulator.

    Args:
        file_path: Path to an audio file readable by ``soundfile.read``.

    Returns:
        The ``VADResponse`` accumulated over all slices.
    """
    import soundfile

    chunk_size = 100  # ms
    model = AutoModel(model="fsmn-vad", model_revision="v2.0.4", disable_update=True)

    vad_result = VADResponse()
    # The printed label is Chinese for "callback"; it is runtime output and is
    # kept as-is.
    vad_result.time_chunk_index_callback = lambda index: print(f"回调: {index}")

    speech, sample_rate = soundfile.read(file_path)
    chunk_stride = int(chunk_size * sample_rate / 1000)  # samples per slice
    cache = {}  # streaming state passed back to the model on every call

    # Ceiling division: the number of slices needed to cover every sample.
    # The previous expression, int(len((speech)-1)/chunk_stride+1), had a
    # misplaced parenthesis and yielded an extra, empty final slice whenever
    # the length was an exact multiple of the stride.
    total_chunk_num = (len(speech) + chunk_stride - 1) // chunk_stride

    for i in range(total_chunk_num):
        # Presumably paces the loop like a live stream (0.1 s per 100 ms slice).
        time.sleep(0.1)
        speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
        is_final = i == total_chunk_num - 1
        res = model.generate(
            input=speech_chunk,
            cache=cache,
            is_final=is_final,
            chunk_size=chunk_size,
        )
        if len(res[0]["value"]) > 0:
            vad_result += VADResponse.from_raw(res)
            # NOTE(review): the original indentation was lost, so it is unclear
            # whether process_time_chunk() ran once per non-empty result (as
            # here) or once after the loop -- confirm against the project.
            vad_result.process_time_chunk()

    return vad_result
def vad_model_use_online_logic(file_path: str) -> None:
    """Push an audio file through ``LogicTrager`` in 200 ms slices.

    Models are loaded with ``load_models(parse_args())``, the file is read in
    full with ``soundfile``, and each consecutive slice is handed to
    ``LogicTrager.push_binary_data``. Nothing is returned; any output comes
    from ``LogicTrager`` itself.

    Args:
        file_path: Path to an audio file readable by ``soundfile.read``.
    """
    import soundfile

    from src.config import parse_args
    from src.functor.model_loader import load_models
    from src.logic_trager import LogicTrager
    from src.models import AudioBinary_Config

    chunk_size = 200  # ms

    args = parse_args()
    models = load_models(args)

    # NOTE(review): soundfile.read returns float64 samples by default, while
    # the config below declares sample_width=2 and channels=1 -- confirm that
    # push_binary_data accepts this array format and that the file is mono.
    speech, sample_rate = soundfile.read(file_path)
    chunk_stride = int(chunk_size * sample_rate / 1000)  # samples per slice

    audio_config = AudioBinary_Config(
        sample_rate=sample_rate,
        sample_width=2,
        channels=1,
        chunk_size=chunk_size,
    )
    logic_trager = LogicTrager(models=models, audio_config=audio_config)

    # Stepping by the stride visits every sample exactly once. The previous
    # range(len(speech)//chunk_stride+1) pushed an extra empty slice whenever
    # the length was an exact multiple of the stride.
    for start in range(0, len(speech), chunk_stride):
        logic_trager.push_binary_data(speech[start:start + chunk_stride])
def asr_model_use_offline(file_path: str) -> List[Dict[str, Any]]:
    """Run offline recognition on a whole audio file in one call.

    Builds a FunASR ``AutoModel`` combining ``paraformer-zh`` with the
    ``fsmn-vad`` and ``cam++`` models, reads the file with ``soundfile`` and
    passes the samples to ``model.generate``.

    Args:
        file_path: Path to an audio file readable by ``soundfile.read``.

    Returns:
        Whatever ``model.generate`` returns for the decoded samples
        (annotated as a list of dicts; not verified here).
    """
    import soundfile
    from funasr import AutoModel

    model = AutoModel(
        model="paraformer-zh",
        model_revision="v2.0.4",
        vad_model="fsmn-vad",
        vad_model_revision="v2.0.4",
        # Punctuation model is currently disabled:
        # punc_model="ct-punc-c", punc_model_revision="v2.0.4",
        spk_model="cam++",
        spk_model_revision="v2.0.2",
        spk_mode="vad_segment",
        # NOTE(review): vad_model_use_online passes disable_update=True
        # instead -- confirm which flag the installed funasr honours.
        auto_update=False,
    )

    # The sample rate from the file is not forwarded to the model.
    # NOTE(review): confirm the audio matches the rate the model expects.
    speech, _sample_rate = soundfile.read(file_path)
    return model.generate(speech)
if __name__ == "__main__":
    # Manual entry point: runs the LogicTrager-based pipeline on the sample
    # recording. The plain streaming variant can be tried instead by calling
    # vad_model_use_online(sample_wav) and printing its result.
    sample_wav = "tests/vad_example.wav"
    vad_result = vad_model_use_online_logic(sample_wav)