9.5 KiB
9.5 KiB
FunASR-FastAPI WebSocket API 文档
本文档详细介绍了如何连接和使用 FunASR-FastAPI 实时语音识别服务的 WebSocket 接口。
1. 连接端点 (Endpoint)
服务的 WebSocket 端点 URL 格式如下:
ws://<your_server_host>:8000/ws/asr/{session_id}?mode=<client_mode>
参数说明
- `{session_id}` (路径参数,`str`,必需): 用于唯一标识一个识别会话(例如,一场会议或一次直播)。所有属于同一次会话的客户端都应使用相同的 `session_id`。
- `mode` (查询参数,`str`,必需): 定义客户端的角色。
  - `sender`: 音频发送者。一个会话中应该只有一个 `sender`。此客户端负责将实时音频流发送到服务器。
  - `receiver`: 结果接收者。一个会话中可以有多个 `receiver`。此客户端只接收由服务器广播的识别结果,不发送音频。
2. 数据格式
2.1 发送数据 (Sender -> Server)
- 音频格式: `sender` 必须发送原始的 PCM 音频数据。
  - 采样率: 16000 Hz
  - 位深: 16-bit (signed integer)
  - 声道数: 单声道 (Mono)
- 传输格式: 必须以二进制 (bytes) 格式发送。
- 结束信号: 当音频流结束时,`sender` 应发送一个文本消息 `"close"` 来通知服务器关闭会话。
2.2 接收数据 (Server -> Receiver)
服务器会将识别结果以 JSON 文本格式广播给会话中的所有 `receiver`(以及 `sender` 自己)。JSON 对象的结构示例如下:
{
"asr": "你好,世界。",
"spk": {
"speaker_id": "uuid-of-the-speaker",
"speaker_name": "SpeakerName",
"score": 0.98
}
}
3. Python 客户端示例
需要安装 `websockets` 库: `pip install websockets`。Sender 示例读取音频文件时还需要 `soundfile` 库: `pip install soundfile`。
3.1 Python Sender 示例 (发送本地音频文件)
这个脚本会读取一个 WAV 文件,并将其内容以流式方式发送到服务器。
import asyncio
import websockets
import soundfile as sf
import uuid
# --- Configuration ---
SERVER_URI = "ws://localhost:8000/ws/asr/{session_id}?mode=sender"
SESSION_ID = str(uuid.uuid4())  # fresh unique ID for this session
AUDIO_FILE = "tests/XT_ZZY_denoise.wav"  # replace with the path to your audio file
# Number of FRAMES (not bytes) read and sent per iteration. SoundFile.read()
# counts frames, so 100 ms of 16 kHz audio is 16000 * 0.1 = 1600 frames,
# which is 3200 bytes once encoded as 16-bit mono PCM.
CHUNK_SIZE = 1600
async def send_audio():
    """Connect to the server as the sender and stream the audio file.

    Reads ``AUDIO_FILE`` in chunks of ``CHUNK_SIZE`` frames and sends each
    chunk as raw 16-bit PCM bytes, pausing for the duration of the chunk so
    the stream is paced like live input. After the last chunk it sends the
    text message ``"close"`` and prints the next message the server returns.

    A closed connection and any other error are reported on stdout instead
    of being raised.
    """
    uri = SERVER_URI.format(session_id=SESSION_ID)
    print(f"作为 Sender 连接到: {uri}")
    async with websockets.connect(uri) as websocket:
        try:
            with sf.SoundFile(AUDIO_FILE, 'r') as f:
                # Raise explicitly rather than assert, so the checks are not
                # stripped when Python runs with -O.
                if f.samplerate != 16000:
                    raise ValueError("音频文件采样率必须为 16kHz")
                if f.channels != 1:
                    raise ValueError("音频文件必须为单声道")
                print("开始发送音频...")
                while True:
                    data = f.read(CHUNK_SIZE, dtype='int16')
                    # Stop only at end of file. Testing the sample values
                    # (data.any()) would also stop on a chunk of pure
                    # silence and truncate the stream.
                    if len(data) == 0:
                        break
                    # Convert the numpy array to a raw byte stream.
                    await websocket.send(data.tobytes())
                    # Sleep for exactly the amount of audio just sent so the
                    # pacing matches real time whatever the chunk size is.
                    await asyncio.sleep(len(data) / f.samplerate)
            print("音频发送完毕,发送结束信号。")
            await websocket.send("close")
            # Wait for the server's final message (or for it to close).
            response = await websocket.recv()
            print(f"收到服务器最终响应: {response}")
        except websockets.exceptions.ConnectionClosed as e:
            print(f"连接已关闭: {e}")
        except Exception as e:
            print(f"发生错误: {e}")


if __name__ == "__main__":
    asyncio.run(send_audio())
3.2 Python Receiver 示例 (接收识别结果)
这个脚本会连接到指定的会话,并持续打印服务器广播的识别结果。
import asyncio
import websockets
# --- Configuration ---
# NOTE: this must be the same SESSION_ID the Sender is using.
SERVER_URI = "ws://localhost:8000/ws/asr/{session_id}?mode=receiver"
# Placeholder value; paste the Sender's session ID here.
SESSION_ID = "在此处粘贴你的Sender会话ID"
async def receive_results():
    """Join the configured session as a receiver and print each message.

    Returns early with an error message while ``SESSION_ID`` still holds the
    placeholder text. Otherwise it prints every message received until the
    connection is closed, then prints the close code and reason.
    """
    placeholder_present = "粘贴你的Sender会话ID" in SESSION_ID
    if placeholder_present:
        print("错误:请先设置有效的 SESSION_ID!")
        return

    uri = SERVER_URI.format(session_id=SESSION_ID)
    print(f"作为 Receiver 连接到: {uri}")

    async with websockets.connect(uri) as conn:
        try:
            print("等待接收识别结果...")
            # Runs until recv() raises ConnectionClosed.
            while True:
                incoming = await conn.recv()
                print(f"收到结果: {incoming}")
        except websockets.exceptions.ConnectionClosed as closed:
            print(f"连接已关闭: {closed.code} {closed.reason}")


if __name__ == "__main__":
    asyncio.run(receive_results())
4. JavaScript 客户端示例 (浏览器)
这个示例展示了如何在网页上通过麦克风获取音频,并将其作为 `sender` 发送。
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>WebSocket ASR Client</title>
</head>
<body>
<h1>FunASR WebSocket Client (Sender)</h1>
<p><strong>Session ID:</strong> <span id="sessionId"></span></p>
<button id="startButton">开始识别</button>
<button id="stopButton" disabled>停止识别</button>
<h2>识别结果:</h2>
<div id="results"></div>
<script>
// Page elements this script reads and updates, looked up once by id.
const [startButton, stopButton, resultsDiv, sessionIdSpan] =
    ['startButton', 'stopButton', 'results', 'sessionId']
        .map((elementId) => document.getElementById(elementId));

// Live session state; assigned by startRecording().
let websocket, audioContext, scriptProcessor, mediaStream;

// Audio capture parameters.
const SAMPLE_RATE = 16000;
const CHUNK_DURATION_MS = 100; // send data once every 100 ms
// Generate a simple random UUID string.
// Each '0', '1' and '8' in the template is replaced by a random hex digit;
// the '4' and the dashes are kept as they are.
function generateUUID() {
    const template = '10000000-1000-4000-8000-100000000000';
    return template.replace(/[018]/g, (placeholder) => {
        const digit = Number(placeholder);
        const randomByte = crypto.getRandomValues(new Uint8Array(1))[0];
        // The mask is 15 for '0' and '1', and 3 for '8'.
        const mask = 15 >> (digit / 4);
        return (digit ^ (randomByte & mask)).toString(16);
    });
}
/**
 * Start a sender session.
 *
 * Generates a session ID, opens the WebSocket in sender mode, then captures
 * the microphone and streams it to the server as 16-bit PCM binary frames.
 * Recognition results received on the socket are appended to the results
 * area. If the microphone cannot be set up, the socket is closed again.
 */
async function startRecording() {
    const sessionId = generateUUID();
    sessionIdSpan.textContent = sessionId;

    // A page served over https may not open a plain ws:// socket, so follow
    // the scheme the page itself was loaded with.
    const wsScheme = window.location.protocol === 'https:' ? 'wss' : 'ws';
    const wsUrl = `${wsScheme}://${window.location.host}/ws/asr/${sessionId}?mode=sender`;
    websocket = new WebSocket(wsUrl);

    websocket.onopen = () => {
        console.log("WebSocket 连接已打开");
        startButton.disabled = true;
        stopButton.disabled = false;
        resultsDiv.innerHTML = '';
    };

    websocket.onmessage = (event) => {
        console.log("收到消息:", event.data);
        let result;
        try {
            result = JSON.parse(event.data);
        } catch (parseError) {
            // Ignore anything that is not a JSON result instead of throwing
            // out of the handler.
            console.warn("收到非 JSON 消息,已忽略:", event.data);
            return;
        }
        if (!result) {
            return;
        }
        const asrText = result.asr || '';
        const spkName = (result.spk && result.spk.speaker_name) || 'Unknown';

        // Build the line with DOM nodes and text content so text coming
        // from the server is never interpreted as HTML.
        const line = document.createElement('p');
        const speaker = document.createElement('strong');
        speaker.textContent = `${spkName}:`;
        line.appendChild(speaker);
        line.appendChild(document.createTextNode(` ${asrText}`));
        resultsDiv.appendChild(line);
    };

    websocket.onclose = () => {
        console.log("WebSocket 连接已关闭");
        stopRecording();
    };

    websocket.onerror = (error) => {
        console.error("WebSocket 错误:", error);
        alert("WebSocket 连接失败!");
        stopRecording();
    };

    try {
        mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
        audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE });
        const source = audioContext.createMediaStreamSource(mediaStream);

        // createScriptProcessor only accepts power-of-two buffer sizes from
        // 256 to 16384 (in sample frames), so round the desired chunk length
        // to the nearest permitted value.
        const targetFrames = CHUNK_DURATION_MS * SAMPLE_RATE / 1000;
        const nearestPowerOfTwo = 2 ** Math.round(Math.log2(targetFrames));
        const bufferSize = Math.min(16384, Math.max(256, nearestPowerOfTwo));
        scriptProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);

        scriptProcessor.onaudioprocess = (e) => {
            if (websocket && websocket.readyState === WebSocket.OPEN) {
                const inputData = e.inputBuffer.getChannelData(0);
                // The server expects 16-bit PCM, so clamp each float sample
                // to [-1, 1] and scale it into the Int16 range.
                const pcmData = new Int16Array(inputData.length);
                for (let i = 0; i < inputData.length; i++) {
                    pcmData[i] = Math.max(-1, Math.min(1, inputData[i])) * 32767;
                }
                websocket.send(pcmData.buffer);
            }
        };

        source.connect(scriptProcessor);
        scriptProcessor.connect(audioContext.destination);
    } catch (err) {
        console.error("无法获取麦克风:", err);
        alert("无法获取麦克风权限!");
        if (websocket) websocket.close();
    }
}
/**
 * Stop the current session and release the audio resources.
 *
 * Sends the text message "close" if the socket is still open, stops the
 * microphone tracks, disconnects the processor node, closes the audio
 * context and resets the buttons. It is called from the stop button and
 * from the socket's close and error handlers, so it is written to be safe
 * to call more than once.
 */
function stopRecording() {
    if (websocket && websocket.readyState === WebSocket.OPEN) {
        websocket.send("close");
    }
    if (mediaStream) {
        mediaStream.getTracks().forEach(track => track.stop());
        mediaStream = null;
    }
    if (scriptProcessor) {
        scriptProcessor.onaudioprocess = null;
        scriptProcessor.disconnect();
        scriptProcessor = null;
    }
    if (audioContext) {
        // Closing an already-closed context rejects its promise, so check
        // the state and drop the reference after the first close.
        if (audioContext.state !== 'closed') {
            audioContext.close();
        }
        audioContext = null;
    }
    startButton.disabled = false;
    stopButton.disabled = true;
}

startButton.addEventListener('click', startRecording);
stopButton.addEventListener('click', stopRecording);
</script>
</body>
</html>