[Feature] 调整VAD工作流程,将VAD产出数据统一规范为 models/audiobinary 中的 AudioBinary_Chunk;完整测试 LogicTrager VAD online 流程。
This commit is contained in:
parent
8b69ff195f
commit
f7138dcb39
4
src/functor/__init__.py
Normal file
4
src/functor/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
from .vad_functor import VAD
|
||||||
|
from .model_loader import load_models
|
||||||
|
|
||||||
|
__all__ = ["VAD", "load_models"]
|
@ -7,6 +7,7 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import logging
|
import logging
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
from src.models import AudioBinary_Config
|
||||||
|
|
||||||
# 配置日志
|
# 配置日志
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@ -20,22 +21,19 @@ class AudioChunk:
|
|||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
max_duration_ms: int = 1000*60*60*10,
|
max_duration_ms: int = 1000*60*60*10,
|
||||||
sample_rate: int = 16000,
|
audio_config : AudioBinary_Config = None,
|
||||||
sample_width: int = 2,
|
):
|
||||||
channels: int = 1):
|
|
||||||
"""
|
"""
|
||||||
初始化音频数据块管理器
|
初始化音频数据块管理器
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
max_duration_ms: 音频池最大留存时间(ms),默认10小时
|
max_duration_ms: 音频池最大留存时间(ms),默认10小时
|
||||||
sample_rate: 采样率,默认16KHz
|
audio_config: 音频配置信息
|
||||||
sample_width: 采样位宽,默认16bit
|
|
||||||
channels: 通道数,默认1
|
|
||||||
"""
|
"""
|
||||||
# 音频参数
|
# 音频参数
|
||||||
self.sample_rate = sample_rate # 采样率:16KHz
|
self.sample_rate = audio_config.sample_rate if audio_config is not None else 16000 # 采样率:16KHz
|
||||||
self.sample_width = sample_width # 采样位宽:16bit
|
self.sample_width = audio_config.sample_width if audio_config is not None else 2 # 采样位宽:16bit
|
||||||
self.channels = channels # 通道数:单声道
|
self.channels = audio_config.channels if audio_config is not None else 1 # 通道数:单声道
|
||||||
|
|
||||||
# 数据存储
|
# 数据存储
|
||||||
self._max_duration_ms = max_duration_ms
|
self._max_duration_ms = max_duration_ms
|
||||||
@ -84,6 +82,18 @@ class AudioChunk:
|
|||||||
logger.error(f"添加音频数据块时出错: {e}")
|
logger.error(f"添加音频数据块时出错: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_chunk_binary(self, start: int = 0, end: Optional[int] = None) -> Optional[bytes]:
    """
    Return a byte-offset slice of all audio buffered so far.

    Args:
        start: Offset of the first byte to return (default 0).
        end: Offset one past the last byte to return. ``None``, or a value
            beyond the buffered data, means "up to the end of the buffer".

    Returns:
        The requested slice of the concatenated buffer, or ``None`` when
        ``start`` lies at or beyond the end of the buffered data.
    """
    print("[AudioChunk] get_chunk_binary", start, end)
    # Join first: start/end are applied to the joined bytes below, so the
    # bounds have to be checked against the byte length, not the number of
    # stored chunks.
    # NOTE(review): assumes self._chunk is a sequence of bytes objects and
    # that start/end are meant as byte offsets — confirm against callers.
    data = b''.join(self._chunk)
    total = len(data)
    if start >= total:
        return None
    if end is None or end > total:
        end = total
    return data[start:end]
|
||||||
|
|
||||||
def get_chunk(self, start_ms: int = 0, end_ms: Optional[int] = None) -> Optional[bytes]:
|
def get_chunk(self, start_ms: int = 0, end_ms: Optional[int] = None) -> Optional[bytes]:
|
||||||
"""
|
"""
|
||||||
获取指定时间范围的音频数据
|
获取指定时间范围的音频数据
|
@ -35,6 +35,7 @@ def load_models(args):
|
|||||||
device=args.device,
|
device=args.device,
|
||||||
disable_pbar=True,
|
disable_pbar=True,
|
||||||
disable_log=True,
|
disable_log=True,
|
||||||
|
disable_update=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 2. 加载在线ASR模型
|
# 2. 加载在线ASR模型
|
||||||
@ -47,6 +48,7 @@ def load_models(args):
|
|||||||
device=args.device,
|
device=args.device,
|
||||||
disable_pbar=True,
|
disable_pbar=True,
|
||||||
disable_log=True,
|
disable_log=True,
|
||||||
|
disable_update=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 3. 加载VAD模型
|
# 3. 加载VAD模型
|
||||||
@ -59,6 +61,7 @@ def load_models(args):
|
|||||||
device=args.device,
|
device=args.device,
|
||||||
disable_pbar=True,
|
disable_pbar=True,
|
||||||
disable_log=True,
|
disable_log=True,
|
||||||
|
disable_update=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. 加载标点符号模型(如果指定)
|
# 4. 加载标点符号模型(如果指定)
|
||||||
@ -72,6 +75,7 @@ def load_models(args):
|
|||||||
device=args.device,
|
device=args.device,
|
||||||
disable_pbar=True,
|
disable_pbar=True,
|
||||||
disable_log=True,
|
disable_log=True,
|
||||||
|
disable_update=True,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
models["punc"] = None
|
models["punc"] = None
|
60
src/functor/vad_functor.py
Normal file
60
src/functor/vad_functor.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
from funasr import AutoModel
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from src.models import VADResponse
|
||||||
|
from src.models import AudioBinary_Config
|
||||||
|
from src.functor.audiochunk import AudioChunk
|
||||||
|
from src.models import AudioBinary_Chunk
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
class VAD:
    """
    Streaming voice-activity-detection wrapper.

    Buffers every pushed audio block in an ``AudioChunk`` pool, feeds the
    block to the VAD model, accumulates the detected time ranges in a
    ``VADResponse`` and hands finished ranges to a callback as
    ``AudioBinary_Chunk`` objects.
    """

    def __init__(self,
                 VAD_model = None,
                 audio_config : AudioBinary_Config = None,
                 callback: Callable = None,
                 ):
        """
        Args:
            VAD_model: Model object exposing ``generate``; when ``None`` the
                "fsmn-vad" model is loaded through ``AutoModel``.
            audio_config: Audio/chunking configuration; when ``None`` a
                default ``AudioBinary_Config()`` is used.
            callback: Callable stored on the VAD result and used by
                ``process_vad_result`` when no explicit callback is given.
        """
        # VAD model
        self.VAD_model = VAD_model
        if self.VAD_model is None:
            self.VAD_model = AutoModel(model="fsmn-vad", model_revision="v2.0.4", disable_update=True)
        # Audio config: fall back to defaults so that push_binary_data can
        # always read ``chunk_size`` (a None config used to crash there).
        self.audio_config = audio_config if audio_config is not None else AudioBinary_Config()
        # Accumulated VAD result
        self.vad_result = VADResponse(time_chunk_index_callback=callback)
        # Pool of raw audio data
        self.audio_chunk = AudioChunk(
            audio_config=self.audio_config
        )
        # Cache dict passed to every generate() call
        self.cache = {}

    def push_binary_data(self,
                         binary_data: bytes,
                         ):
        """
        Store one block of audio and run the VAD model on it.

        Args:
            binary_data: The audio block to buffer and analyse.
        """
        # Buffer the raw data
        self.audio_chunk.add_chunk(binary_data)
        # Run VAD on this block
        res = self.VAD_model.generate(input=binary_data,
                                      cache=self.cache,
                                      chunk_size=self.audio_config.chunk_size,
                                      is_final=False)
        # Only merge results that actually carry detected ranges; also guard
        # against an empty result list.
        if res and len(res[0]["value"]):
            self.vad_result += VADResponse.from_raw(res)

    def set_callback(self,
                     callback: Callable,
                     ):
        """Replace the callback stored on the VAD result."""
        self.vad_result.time_chunk_index_callback = callback

    def process_vad_result(self, callback: Callable = None):
        """
        Deliver pending VAD time ranges as ``AudioBinary_Chunk`` objects.

        Args:
            callback: Receiver for each chunk; when ``None`` the callback
                stored on the VAD result is used.
        """
        callback = callback if callback is not None else self.vad_result.time_chunk_index_callback
        if callback is None:
            # No receiver at all: let VADResponse handle the missing
            # callback itself instead of wrapping None in a callable that
            # would fail with "'NoneType' object is not callable".
            self.vad_result.process_time_chunk(None)
            return

        def _emit(time_range):
            # Resolve the time range to audio bytes from the pool.
            return callback(
                AudioBinary_Chunk(
                    start_time=time_range["start_time"],
                    end_time=time_range["end_time"],
                    chunk=self.audio_chunk.get_chunk(time_range["start_time"], time_range["end_time"])
                )
            )

        self.vad_result.process_time_chunk(_emit)
|
@ -5,7 +5,7 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Any, Dict, Type
|
from typing import Any, Dict, Type, Callable
|
||||||
|
|
||||||
# 配置日志
|
# 配置日志
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@ -88,57 +88,62 @@ class AutoAfterMeta(type):
|
|||||||
5->6 __after__push_result_queue 调用回调函数
|
5->6 __after__push_result_queue 调用回调函数
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from src.functor import VAD
|
||||||
|
from src.models import AudioBinary_Config
|
||||||
|
from src.models import AudioBinary_Chunk
|
||||||
|
from typing import List
|
||||||
|
|
||||||
class LogicTrager(metaclass=AutoAfterMeta):
|
class LogicTrager(metaclass=AutoAfterMeta):
|
||||||
"""逻辑触发器类"""
|
"""逻辑触发器类"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
audio_chunk_max_size: int = 1024 * 1024 * 10,
|
audio_chunk_max_size: int = 1024 * 1024 * 10,
|
||||||
sample_rate: int = 16000,
|
audio_config: AudioBinary_Config = None,
|
||||||
channels: int = 1,
|
result_callback: Callable = None,
|
||||||
on_result_callback: Callable = None,
|
models: Dict[str, Any] = None,
|
||||||
):
|
):
|
||||||
"""初始化"""
|
"""初始化"""
|
||||||
# 存储音频块
|
# 存储音频块
|
||||||
self._audio_chunk = []
|
self._audio_chunk : List[AudioBinary_Chunk] = []
|
||||||
# 存储二进制数据
|
# 存储二进制数据
|
||||||
self._audio_chunk_binary = b''
|
self._audio_chunk_binary = b''
|
||||||
self._audio_chunk_max_size = audio_chunk_max_size
|
self._audio_chunk_max_size = audio_chunk_max_size
|
||||||
# 音频参数
|
# 音频参数
|
||||||
self._sample_rate = sample_rate
|
self._audio_config = audio_config if audio_config is not None else AudioBinary_Config()
|
||||||
self._channels = channels
|
|
||||||
# 结果队列
|
# 结果队列
|
||||||
self._result_queue = []
|
self._result_queue = []
|
||||||
# 回调函数
|
# 聚合结果回调函数
|
||||||
self._on_result_callback = on_result_callback
|
self._aggregate_result_callback = result_callback
|
||||||
|
# 组件
|
||||||
|
self._vad = VAD(VAD_model = models.get("vad"), audio_config = self._audio_config)
|
||||||
|
self._vad.set_callback(self.push_audio_chunk)
|
||||||
|
|
||||||
|
|
||||||
logger.info("初始化LogicTrager")
|
logger.info("初始化LogicTrager")
|
||||||
|
|
||||||
def push_binary_data(self, chunk: bytes) -> None:
|
def push_binary_data(self, chunk: bytes) -> None:
|
||||||
"""
|
"""
|
||||||
添加音频块
|
压入音频块至VAD模块
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
chunk: 音频数据块
|
chunk: 音频数据块
|
||||||
"""
|
"""
|
||||||
if self._audio_chunk is None:
|
# print("LogicTrager push_binary_data", len(chunk))
|
||||||
logger.error("AudioChunk未初始化")
|
self._vad.push_binary_data(chunk)
|
||||||
return
|
self.__after__push_binary_data()
|
||||||
|
|
||||||
self._audio_chunk_binary += chunk
|
|
||||||
logger.debug(f"添加音频块,大小: {len(chunk)}字节")
|
|
||||||
|
|
||||||
def __after__push_binary_data(self) -> None:
|
def __after__push_binary_data(self) -> None:
|
||||||
"""
|
"""
|
||||||
添加音频块后处理
|
添加音频块后处理
|
||||||
VAD检测,将检测到的VAD压入音频块
|
|
||||||
"""
|
"""
|
||||||
# VAD检测
|
# print("LogicTrager __after__push_binary_data")
|
||||||
pass
|
self._vad.process_vad_result()
|
||||||
# 压入音频块 push_audio_chunk
|
|
||||||
|
|
||||||
def push_audio_chunk(self, chunk: bytes) -> None:
|
def push_audio_chunk(self, chunk: AudioBinary_Chunk) -> None:
|
||||||
"""
|
"""
|
||||||
压入音频块
|
音频处理
|
||||||
"""
|
"""
|
||||||
|
print("LogicTrager push_audio_chunk [{}ms:{}ms] (len={})".format(chunk.start_time, chunk.end_time, len(chunk.chunk)))
|
||||||
self._audio_chunk.append(chunk)
|
self._audio_chunk.append(chunk)
|
||||||
|
|
||||||
def __after__push_audio_chunk(self) -> None:
|
def __after__push_audio_chunk(self) -> None:
|
||||||
|
3
src/models/__init__.py
Normal file
3
src/models/__init__.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
from .audiobinary import AudioBinary_Config, AudioBinary_Chunk
|
||||||
|
from .vad import VADResponse
|
||||||
|
__all__ = ["AudioBinary_Config", "AudioBinary_Chunk", "VADResponse"]
|
16
src/models/audiobinary.py
Normal file
16
src/models/audiobinary.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from typing import Optional

from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
class AudioBinary_Config(BaseModel):
    """Configuration for a binary audio stream and how it is chunked."""
    # Optional raw audio payload; annotated Optional because it defaults to None.
    audio_data: Optional[bytes] = Field(description="音频数据", default=None)
    # Chunk size — presumably milliseconds; TODO confirm.
    chunk_size: int = Field(description="块大小", default=100)
    # Chunk stride — presumably in samples; TODO confirm.
    chunk_stride: int = Field(description="块步长", default=1600)
    # Sampling rate (default 16000).
    sample_rate: int = Field(description="采样率", default=16000)
    # Sample width (default 2).
    sample_width: int = Field(description="采样位宽", default=2)
    # Number of channels (default 1).
    channels: int = Field(description="通道数", default=1)
|
||||||
|
|
||||||
|
class AudioBinary_Chunk(BaseModel):
    """One segment of audio together with its start and end time."""
    # Start time of the segment (default 0).
    start_time: int = Field(description="开始时间", default=0)
    # End time of the segment (default 0).
    end_time: int = Field(description="结束时间", default=0)
    # Audio bytes of the segment. Optional because the default is None and
    # producers may pass an Optional[bytes] lookup result straight through.
    chunk: Optional[bytes] = Field(description="音频块", default=None)
|
@ -38,11 +38,16 @@ class VADResponse(BaseModel):
|
|||||||
"""处理时间块"""
|
"""处理时间块"""
|
||||||
# print("Enter process_time_chunk", self.time_chunk_index, len(self.time_chunk))
|
# print("Enter process_time_chunk", self.time_chunk_index, len(self.time_chunk))
|
||||||
while self.time_chunk_index < len(self.time_chunk) - 1:
|
while self.time_chunk_index < len(self.time_chunk) - 1:
|
||||||
if self.time_chunk[self.time_chunk_index].end != -1:
|
index = self.time_chunk_index
|
||||||
|
if self.time_chunk[index].end != -1:
|
||||||
|
x = {
|
||||||
|
"start_time": self.time_chunk[index].start,
|
||||||
|
"end_time": self.time_chunk[index].end
|
||||||
|
}
|
||||||
if callback is not None:
|
if callback is not None:
|
||||||
callback(self.time_chunk_index)
|
callback(x)
|
||||||
elif self.time_chunk_index_callback is not None:
|
elif self.time_chunk_index_callback is not None:
|
||||||
self.time_chunk_index_callback(self.time_chunk_index)
|
self.time_chunk_index_callback(x)
|
||||||
else:
|
else:
|
||||||
print("[Warning] No callback available")
|
print("[Warning] No callback available")
|
||||||
self.time_chunk_index += 1
|
self.time_chunk_index += 1
|
||||||
@ -125,3 +130,14 @@ class VADResponse(BaseModel):
|
|||||||
result_str += f"[{value_item.start}:{value_item.end}]\n"
|
result_str += f"[{value_item.start}:{value_item.end}]\n"
|
||||||
return result_str
|
return result_str
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return iter(self.time_chunk)
|
||||||
|
|
||||||
|
def __next__(self):
|
||||||
|
return next(self.time_chunk)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.time_chunk)
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return self.time_chunk[index]
|
@ -1,4 +1,4 @@
|
|||||||
from tests.modelsuse import vad_model_use_online
|
from tests.modelsuse import vad_model_use_online_logic
|
||||||
|
|
||||||
vad_result = vad_model_use_online("tests/vad_example.wav")
|
vad_result = vad_model_use_online_logic("tests/vad_example.wav")
|
||||||
print(vad_result)
|
print(vad_result)
|
@ -1,6 +1,6 @@
|
|||||||
from funasr import AutoModel
|
from funasr import AutoModel
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
from src.pydantic_models import VADResponse
|
from src.models import VADResponse
|
||||||
import time
|
import time
|
||||||
|
|
||||||
def vad_model_use_online(file_path: str) -> List[Dict[str, Any]]:
|
def vad_model_use_online(file_path: str) -> List[Dict[str, Any]]:
|
||||||
@ -32,6 +32,36 @@ def vad_model_use_online(file_path: str) -> List[Dict[str, Any]]:
|
|||||||
# print(item)
|
# print(item)
|
||||||
return vad_result
|
return vad_result
|
||||||
|
|
||||||
|
def vad_model_use_online_logic(file_path: str) -> None:
    """
    Stream an audio file through ``LogicTrager`` in fixed-size chunks.

    Loads the models from the parsed command-line arguments, reads the file
    with ``soundfile`` and pushes it chunk by chunk into a ``LogicTrager``.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        None. Nothing is collected or returned by this function.
    """
    from src.logic_trager import LogicTrager
    import soundfile

    from src.config import parse_args
    args = parse_args()

    from src.functor.model_loader import load_models
    models = load_models(args)

    chunk_size = 200  # ms
    from src.models import AudioBinary_Config

    speech, sample_rate = soundfile.read(file_path)
    # Number of samples in one chunk of chunk_size milliseconds.
    chunk_stride = int(chunk_size * sample_rate / 1000)
    audio_config = AudioBinary_Config(sample_rate=sample_rate, sample_width=2, channels=1, chunk_size=chunk_size)

    logic_trager = LogicTrager(models=models, audio_config=audio_config)
    # Step by offset so that no empty trailing chunk is pushed when the
    # sample count is an exact multiple of the stride.
    for offset in range(0, len(speech), chunk_stride):
        speech_chunk = speech[offset:offset + chunk_stride]
        logic_trager.push_binary_data(speech_chunk)

    return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
vad_result = vad_model_use_online("tests/vad_example.wav")
|
# vad_result = vad_model_use_online("tests/vad_example.wav")
|
||||||
|
vad_result = vad_model_use_online_logic("tests/vad_example.wav")
|
||||||
# print(vad_result)
|
# print(vad_result)
|
Loading…
x
Reference in New Issue
Block a user