From f7138dcb39b138c15e1d32a2178b74936dc40bb5 Mon Sep 17 00:00:00 2001 From: Keeeer Date: Tue, 15 Apr 2025 17:15:13 +0800 Subject: [PATCH] =?UTF-8?q?[Feature]=20=E8=B0=83=E6=95=B4VAD=E5=B7=A5?= =?UTF-8?q?=E4=BD=9C=E6=B5=81=E7=A8=8B=EF=BC=8C=E8=A7=84=E8=8C=83VAD?= =?UTF-8?q?=E4=BA=A7=E5=87=BA=E6=95=B0=E6=8D=AE=E8=A7=84=E8=8C=83=E4=B8=BA?= =?UTF-8?q?=20models/audiobinary=E4=B8=AD=E7=9A=84AudioBinary=5FChunk?= =?UTF-8?q?=EF=BC=9B=E5=AE=8C=E6=95=B4=E6=B5=8B=E8=AF=95LogicTrager=20VAD?= =?UTF-8?q?=20online=E6=B5=81=E7=A8=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/functor/__init__.py | 4 ++ src/{ => functor}/audiochunk.py | 28 ++++++---- src/{models.py => functor/model_loader.py} | 4 ++ src/functor/vad_functor.py | 60 ++++++++++++++++++++++ src/logic_trager.py | 49 ++++++++++-------- src/models/__init__.py | 3 ++ src/models/audiobinary.py | 16 ++++++ src/{pydantic_models.py => models/vad.py} | 22 ++++++-- test_main.py | 4 +- tests/modelsuse.py | 34 +++++++++++- 10 files changed, 186 insertions(+), 38 deletions(-) create mode 100644 src/functor/__init__.py rename src/{ => functor}/audiochunk.py (84%) rename src/{models.py => functor/model_loader.py} (94%) create mode 100644 src/functor/vad_functor.py create mode 100644 src/models/__init__.py create mode 100644 src/models/audiobinary.py rename src/{pydantic_models.py => models/vad.py} (87%) diff --git a/src/functor/__init__.py b/src/functor/__init__.py new file mode 100644 index 0000000..5ecc5b1 --- /dev/null +++ b/src/functor/__init__.py @@ -0,0 +1,4 @@ +from .vad_functor import VAD +from .model_loader import load_models + +__all__ = ["VAD", "load_models"] \ No newline at end of file diff --git a/src/audiochunk.py b/src/functor/audiochunk.py similarity index 84% rename from src/audiochunk.py rename to src/functor/audiochunk.py index 971a269..f7405ac 100644 --- a/src/audiochunk.py +++ b/src/functor/audiochunk.py @@ -7,6 +7,7 @@ import numpy as np import logging from typing import List, Optional, Union +from src.models import AudioBinary_Config # 配置日志 logging.basicConfig( @@ -20,22 +21,19 @@ class AudioChunk: def __init__(self, max_duration_ms: int = 1000*60*60*10, - sample_rate: int = 16000, - sample_width: int = 2, - channels: int = 1): + audio_config : AudioBinary_Config = None, + ): """ 初始化音频数据块管理器 参数: max_duration_ms: 音频池最大留存时间(ms),默认10小时 - sample_rate: 采样率,默认16KHz - sample_width: 采样位宽,默认16bit - channels: 通道数,默认1 + audio_config: 音频配置信息 """ # 音频参数 - self.sample_rate = sample_rate # 采样率:16KHz - self.sample_width = sample_width # 采样位宽:16bit - self.channels = channels # 通道数:单声道 + self.sample_rate = audio_config.sample_rate if audio_config is not None else 16000 # 采样率:16KHz + self.sample_width = audio_config.sample_width if audio_config is not None else 2 # 采样位宽:16bit + self.channels = audio_config.channels if audio_config is not None else 1 # 通道数:单声道 # 数据存储 self._max_duration_ms = max_duration_ms @@ -84,6 +82,18 @@ class AudioChunk: logger.error(f"添加音频数据块时出错: {e}") return False + def get_chunk_binary(self, start: int = 0, end: Optional[int] = None) -> Optional[bytes]: + """ + 获取指定索引的音频数据块 + """ + print("[AudioChunk] get_chunk_binary", start, end) + if start >= len(self._chunk): + return None + if end is None or end > len(self._chunk): + end = len(self._chunk) + data = b''.join(self._chunk) + return data[start:end] + def get_chunk(self, start_ms: int = 0, end_ms: Optional[int] = None) -> Optional[bytes]: """ 获取指定时间范围的音频数据 diff --git a/src/models.py b/src/functor/model_loader.py similarity index 94% rename from src/models.py rename to src/functor/model_loader.py index 511116a..99f699d 100644 --- a/src/models.py +++ b/src/functor/model_loader.py @@ -35,6 +35,7 @@ def load_models(args): device=args.device, disable_pbar=True, disable_log=True, + disable_update=True, ) # 2. 加载在线ASR模型 @@ -47,6 +48,7 @@ def load_models(args): device=args.device, disable_pbar=True, disable_log=True, + disable_update=True, ) # 3. 加载VAD模型 @@ -59,6 +61,7 @@ def load_models(args): device=args.device, disable_pbar=True, disable_log=True, + disable_update=True, ) # 4. 加载标点符号模型(如果指定) @@ -72,6 +75,7 @@ def load_models(args): device=args.device, disable_pbar=True, disable_log=True, + disable_update=True, ) else: models["punc"] = None diff --git a/src/functor/vad_functor.py b/src/functor/vad_functor.py new file mode 100644 index 0000000..cc87040 --- /dev/null +++ b/src/functor/vad_functor.py @@ -0,0 +1,60 @@ +from funasr import AutoModel +from typing import List, Dict, Any +from src.models import VADResponse +from src.models import AudioBinary_Config +from src.functor.audiochunk import AudioChunk +from src.models import AudioBinary_Chunk +from typing import Callable + +class VAD: + + def __init__(self, + VAD_model = None, + audio_config : AudioBinary_Config = None, + callback: Callable = None, + ): + # vad model + self.VAD_model = VAD_model + if self.VAD_model is None: + self.VAD_model = AutoModel(model="fsmn-vad", model_revision="v2.0.4", disable_update=True) + # audio config + self.audio_config = audio_config + # vad result + self.vad_result = VADResponse(time_chunk_index_callback=callback) + # audio binary poll + self.audio_chunk = AudioChunk( + audio_config=self.audio_config + ) + self.cache = {} + + def push_binary_data(self, + binary_data: bytes, + ): + # 压入二进制数据 + self.audio_chunk.add_chunk(binary_data) + # 处理音频块 + res = self.VAD_model.generate(input=binary_data, + cache=self.cache, + chunk_size=self.audio_config.chunk_size, + is_final=False) + # print("VAD generate", res) + if len(res[0]["value"]): + self.vad_result += VADResponse.from_raw(res) + + def set_callback(self, + callback: Callable, + ): + self.vad_result.time_chunk_index_callback = callback + + def process_vad_result(self, callback: Callable = None): + # 处理VAD结果 + callback = callback if callback is not None else self.vad_result.time_chunk_index_callback + self.vad_result.process_time_chunk( + lambda x : callback( + AudioBinary_Chunk( + start_time=x["start_time"], + end_time=x["end_time"], + chunk=self.audio_chunk.get_chunk(x["start_time"], x["end_time"]) + ) + ) + ) \ No newline at end of file diff --git a/src/logic_trager.py b/src/logic_trager.py index da0034b..68c8cdd 100644 --- a/src/logic_trager.py +++ b/src/logic_trager.py @@ -5,7 +5,7 @@ """ import logging -from typing import Any, Dict, Type +from typing import Any, Dict, Type, Callable # 配置日志 logging.basicConfig( @@ -88,57 +88,62 @@ class AutoAfterMeta(type): 5->6 __after__push_result_queue 调用回调函数 """ +from src.functor import VAD +from src.models import AudioBinary_Config +from src.models import AudioBinary_Chunk +from typing import List + class LogicTrager(metaclass=AutoAfterMeta): """逻辑触发器类""" def __init__(self, audio_chunk_max_size: int = 1024 * 1024 * 10, - sample_rate: int = 16000, - channels: int = 1, - on_result_callback: Callable = None, + audio_config: AudioBinary_Config = None, + result_callback: Callable = None, + models: Dict[str, Any] = None, ): """初始化""" # 存储音频块 - self._audio_chunk = [] + self._audio_chunk : List[AudioBinary_Chunk] = [] # 存储二进制数据 self._audio_chunk_binary = b'' self._audio_chunk_max_size = audio_chunk_max_size # 音频参数 - self._sample_rate = sample_rate - self._channels = channels + self._audio_config = audio_config if audio_config is not None else AudioBinary_Config() # 结果队列 self._result_queue = [] - # 回调函数 - self._on_result_callback = on_result_callback + # 聚合结果回调函数 + self._aggregate_result_callback = result_callback + # 组件 + self._vad = VAD(VAD_model = models.get("vad"), audio_config = self._audio_config) + self._vad.set_callback(self.push_audio_chunk) + + logger.info("初始化LogicTrager") def push_binary_data(self, chunk: bytes) -> None: """ - 添加音频块 + 压入音频块至VAD模块 参数: chunk: 音频数据块 """ - if self._audio_chunk is None: - logger.error("AudioChunk未初始化") - return - - self._audio_chunk_binary += chunk - logger.debug(f"添加音频块,大小: {len(chunk)}字节") + # print("LogicTrager push_binary_data", len(chunk)) + self._vad.push_binary_data(chunk) + self.__after__push_binary_data() def __after__push_binary_data(self) -> None: """ 添加音频块后处理 - VAD检测,将检测到的VAD压入音频块 """ - # VAD检测 - pass - # 压入音频块 push_audio_chunk + # print("LogicTrager __after__push_binary_data") + self._vad.process_vad_result() - def push_audio_chunk(self, chunk: bytes) -> None: + def push_audio_chunk(self, chunk: AudioBinary_Chunk) -> None: """ - 压入音频块 + 音频处理 """ + print("LogicTrager push_audio_chunk [{}ms:{}ms] (len={})".format(chunk.start_time, chunk.end_time, len(chunk.chunk))) self._audio_chunk.append(chunk) def __after__push_audio_chunk(self) -> None: diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..d64a40a --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1,3 @@ +from .audiobinary import AudioBinary_Config, AudioBinary_Chunk +from .vad import VADResponse +__all__ = ["AudioBinary_Config", "AudioBinary_Chunk", "VADResponse"] \ No newline at end of file diff --git a/src/models/audiobinary.py b/src/models/audiobinary.py new file mode 100644 index 0000000..3ad0d26 --- /dev/null +++ b/src/models/audiobinary.py @@ -0,0 +1,16 @@ +from pydantic import BaseModel, Field + +class AudioBinary_Config(BaseModel): + """二进制音频块配置信息""" + audio_data: bytes = Field(description="音频数据", default=None) + chunk_size: int = Field(description="块大小", default=100) + chunk_stride: int = Field(description="块步长", default=1600) + sample_rate: int = Field(description="采样率", default=16000) + sample_width: int = Field(description="采样位宽", default=2) + channels: int = Field(description="通道数", default=1) + +class AudioBinary_Chunk(BaseModel): + """音频块""" + start_time: int = Field(description="开始时间", default=0) + end_time: int = Field(description="结束时间", default=0) + chunk: bytes = Field(description="音频块", default=None) diff --git a/src/pydantic_models.py b/src/models/vad.py similarity index 87% rename from src/pydantic_models.py rename to src/models/vad.py index 10c252c..46f087a 100644 --- a/src/pydantic_models.py +++ b/src/models/vad.py @@ -38,11 +38,16 @@ class VADResponse(BaseModel): """处理时间块""" # print("Enter process_time_chunk", self.time_chunk_index, len(self.time_chunk)) while self.time_chunk_index < len(self.time_chunk) - 1: - if self.time_chunk[self.time_chunk_index].end != -1: + index = self.time_chunk_index + if self.time_chunk[index].end != -1: + x = { + "start_time": self.time_chunk[index].start, + "end_time": self.time_chunk[index].end + } if callback is not None: - callback(self.time_chunk_index) + callback(x) elif self.time_chunk_index_callback is not None: - self.time_chunk_index_callback(self.time_chunk_index) + self.time_chunk_index_callback(x) else: print("[Warning] No callback available") self.time_chunk_index += 1 @@ -125,3 +130,14 @@ class VADResponse(BaseModel): result_str += f"[{value_item.start}:{value_item.end}]\n" return result_str + def __iter__(self): + return iter(self.time_chunk) + + def __next__(self): + return next(self.time_chunk) + + def __len__(self): + return len(self.time_chunk) + + def __getitem__(self, index): + return self.time_chunk[index] \ No newline at end of file diff --git a/test_main.py b/test_main.py index 919be7b..9f0c069 100644 --- a/test_main.py +++ b/test_main.py @@ -1,4 +1,4 @@ -from tests.modelsuse import vad_model_use_online +from tests.modelsuse import vad_model_use_online_logic -vad_result = vad_model_use_online("tests/vad_example.wav") +vad_result = vad_model_use_online_logic("tests/vad_example.wav") print(vad_result) \ No newline at end of file diff --git a/tests/modelsuse.py b/tests/modelsuse.py index 35faad9..6aee9fa 100644 --- a/tests/modelsuse.py +++ b/tests/modelsuse.py @@ -1,6 +1,6 @@ from funasr import AutoModel from typing import List, Dict, Any -from src.pydantic_models import VADResponse +from src.models import VADResponse import time def vad_model_use_online(file_path: str) -> List[Dict[str, Any]]: @@ -32,6 +32,36 @@ def vad_model_use_online(file_path: str) -> List[Dict[str, Any]]: # print(item) return vad_result +def vad_model_use_online_logic(file_path: str) -> List[Dict[str, Any]]: + from src.logic_trager import LogicTrager + import soundfile + + from src.config import parse_args + args = parse_args() + + from src.functor.model_loader import load_models + models = load_models(args) + + chunk_size = 200 # ms + from src.models import AudioBinary_Config + import soundfile + + speech, sample_rate = soundfile.read(file_path) + chunk_stride = int(chunk_size * sample_rate / 1000) + audio_config = AudioBinary_Config(sample_rate=sample_rate, sample_width=2, channels=1, chunk_size=chunk_size) + + logic_trager = LogicTrager(models=models, audio_config=audio_config) + for i in range(len(speech)//chunk_stride+1): + speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride] + logic_trager.push_binary_data(speech_chunk) + + # for item in items: + # print(item) + return None + + + if __name__ == "__main__": - vad_result = vad_model_use_online("tests/vad_example.wav") + # vad_result = vad_model_use_online("tests/vad_example.wav") + vad_result = vad_model_use_online_logic("tests/vad_example.wav") # print(vad_result) \ No newline at end of file