# FunASR-FastAPI WebSocket API Documentation

This document describes how to connect to and use the WebSocket interface of the FunASR-FastAPI real-time speech recognition service.

## 1. Connection Endpoint

The WebSocket endpoint URL of the service has the following format:

```
ws://<your_server_host>:8000/ws/asr/{session_id}?mode=<client_mode>
```

### Parameters

- **`{session_id}`** (path parameter, `str`, **required**):
  Uniquely identifies a recognition session (for example, a meeting or a live stream). All clients belonging to the same session must use the same `session_id`.

- **`mode`** (query parameter, `str`, **required**):
  Defines the client's role; a minimal URL-construction sketch follows this list.
  - `sender`: the audio sender. A session should have exactly one `sender`. This client streams live audio to the server.
  - `receiver`: a result receiver. A session may have any number of `receiver`s. These clients only receive the recognition results broadcast by the server and send no audio.
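
Since both roles share one `session_id`, clients typically derive their URLs from a single generated id. A minimal sketch (the host `localhost:8000` is an assumption; substitute your deployment):

```python
import uuid

# One session id shared by the sender and all receivers (hypothetical host).
session_id = str(uuid.uuid4())
sender_url = f"ws://localhost:8000/ws/asr/{session_id}?mode=sender"
receiver_url = f"ws://localhost:8000/ws/asr/{session_id}?mode=receiver"
print(sender_url)
print(receiver_url)
```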

## 2. Data Formats

### 2.1 Sending Data (Sender -> Server)

- **Audio format**: the `sender` must send raw **PCM audio data**. (If your source is integer PCM, see the conversion sketch after this list.)
- **Sample rate**: 16000 Hz
- **Bit depth**: 32-bit floating point (float32)
- **Channels**: mono (1 channel)
- **Transport**: must be sent as **binary (bytes)** frames.
- **End-of-stream signal**: when the audio stream ends, the `sender` should send the **text message** `"close"` to tell the server to end the session.
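
Many capture pipelines produce 16-bit integer PCM rather than float32. A minimal conversion sketch using NumPy (the function name is illustrative, not part of this API):

```python
import numpy as np

def int16_pcm_to_float32_bytes(pcm_bytes: bytes) -> bytes:
    """Convert 16-bit little-endian PCM into the float32 bytes the server expects."""
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    # Scale the int16 range [-32768, 32767] into [-1.0, 1.0) as float32.
    return (samples.astype(np.float32) / 32768.0).tobytes()
```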

### 2.2 Receiving Data (Server -> Receiver)

The server broadcasts recognition results as **JSON text** to every `receiver` in the session (and to the `sender` itself). An example of the JSON object structure:

```json
{
  "asr": "你好,世界。",
  "spk": {
    "speaker_id": "uuid-of-the-speaker",
    "speaker_name": "SpeakerName",
    "score": 0.98
  }
}
```
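
On the client side, each broadcast message can be decoded with a standard JSON parser. A minimal sketch mirroring the structure above (`message` is assumed to be a received text frame):

```python
import json

def handle_message(message: str) -> None:
    result = json.loads(message)
    text = result.get("asr", "")
    spk = result.get("spk") or {}
    name = spk.get("speaker_name", "Unknown")
    print(f"{name}: {text} (score={spk.get('score')})")
```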

## 3. Python Client Examples

The examples require the `websockets` library: `pip install websockets`

### 3.1 Python Sender Example (streaming a local audio file)

This script reads a WAV file and streams its contents to the server. (The sender example also needs `soundfile`: `pip install soundfile`.)

```python
import asyncio
import uuid

import soundfile as sf
import websockets

# --- Configuration ---
SERVER_URI = "ws://localhost:8000/ws/asr/{session_id}?mode=sender"
SESSION_ID = str(uuid.uuid4())  # Generate a unique ID for this session
AUDIO_FILE = "tests/XT_ZZY_denoise.wav"  # Replace with the path to your audio file
# soundfile reads in frames, not bytes: 1600 frames = 100 ms at 16 kHz
# (1600 frames * 4 bytes per float32 sample = 6400 bytes per chunk).
CHUNK_SIZE = 1600

async def send_audio():
    """Connect to the server and stream an audio file."""
    uri = SERVER_URI.format(session_id=SESSION_ID)
    print(f"Connecting as sender to: {uri}")

    async with websockets.connect(uri) as websocket:
        try:
            # Read the audio file
            with sf.SoundFile(AUDIO_FILE, 'r') as f:
                assert f.samplerate == 16000, "The audio file must be sampled at 16 kHz"
                assert f.channels == 1, "The audio file must be mono"

                print("Sending audio...")
                while True:
                    # Read the next chunk as float32
                    data = f.read(CHUNK_SIZE, dtype='float32')
                    # Check the length rather than the values, so silent
                    # (all-zero) chunks are still sent.
                    if len(data) == 0:
                        break
                    # Convert the numpy array into a raw byte stream
                    await websocket.send(data.tobytes())
                    await asyncio.sleep(0.1)  # Simulate real-time audio input

            print("Audio sent; sending the end-of-stream signal.")
            await websocket.send("close")

            # Wait for the server's final confirmation or for the connection to close
            response = await websocket.recv()
            print(f"Final server response: {response}")

        except websockets.exceptions.ConnectionClosed as e:
            print(f"Connection closed: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    asyncio.run(send_audio())
```
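
To stream from a microphone instead of a file, the third-party `sounddevice` library (`pip install sounddevice`) can deliver float32 frames directly. A hedged sketch, not part of this project's code; blocking reads are pushed onto a worker thread so the event loop stays responsive:

```python
import asyncio
import uuid

import sounddevice as sd
import websockets

SESSION_ID = str(uuid.uuid4())
URI = f"ws://localhost:8000/ws/asr/{SESSION_ID}?mode=sender"
FRAMES_PER_CHUNK = 1600  # 100 ms at 16 kHz

async def send_microphone(seconds: float = 10.0):
    """Stream roughly `seconds` of microphone audio, then signal end-of-stream."""
    async with websockets.connect(URI) as websocket:
        with sd.InputStream(samplerate=16000, channels=1, dtype="float32",
                            blocksize=FRAMES_PER_CHUNK) as stream:
            for _ in range(int(seconds * 10)):  # 10 chunks per second
                # stream.read() blocks, so run it off the event loop (Python 3.9+).
                data, _overflowed = await asyncio.to_thread(stream.read, FRAMES_PER_CHUNK)
                await websocket.send(data.tobytes())
        await websocket.send("close")

if __name__ == "__main__":
    asyncio.run(send_microphone())
```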

### 3.2 Python Receiver Example (receiving recognition results)

This script connects to a given session and keeps printing the recognition results broadcast by the server.

```python
import asyncio

import websockets

# --- Configuration ---
# !!! Must be the same SESSION_ID that the sender is using !!!
SERVER_URI = "ws://localhost:8000/ws/asr/{session_id}?mode=receiver"
SESSION_ID = "paste-your-sender-session-id-here"

async def receive_results():
    """Connect to the server and receive recognition results."""
    if "paste-your-sender-session-id" in SESSION_ID:
        print("Error: please set a valid SESSION_ID first!")
        return

    uri = SERVER_URI.format(session_id=SESSION_ID)
    print(f"Connecting as receiver to: {uri}")

    async with websockets.connect(uri) as websocket:
        try:
            print("Waiting for recognition results...")
            while True:
                message = await websocket.recv()
                print(f"Received result: {message}")
        except websockets.exceptions.ConnectionClosed as e:
            print(f"Connection closed: {e.code} {e.reason}")

if __name__ == "__main__":
    asyncio.run(receive_results())
```
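
For a quick end-to-end smoke test, both roles can run concurrently in one process. A minimal, self-contained sketch that sends two seconds of float32 silence purely to exercise the protocol (it will produce little or no recognition text):

```python
import asyncio
import uuid

import websockets

SESSION_ID = str(uuid.uuid4())
BASE = "ws://localhost:8000/ws/asr/" + SESSION_ID

async def tiny_sender():
    async with websockets.connect(BASE + "?mode=sender") as ws:
        silence = bytes(4 * 1600)  # 100 ms of float32 silence
        for _ in range(20):        # ~2 s of audio in total
            await ws.send(silence)
            await asyncio.sleep(0.1)
        await ws.send("close")

async def tiny_receiver():
    async with websockets.connect(BASE + "?mode=receiver") as ws:
        try:
            while True:
                print("result:", await ws.recv())
        except websockets.exceptions.ConnectionClosed:
            pass

async def main():
    await asyncio.gather(tiny_sender(), tiny_receiver())

if __name__ == "__main__":
    asyncio.run(main())
```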

## 4. JavaScript Client Example (Browser)

This example shows how to capture microphone audio in a web page and send it as a `sender`. It assumes the page is served from the same host as the API, since it builds the WebSocket URL from `window.location.host`.

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>WebSocket ASR Client</title>
</head>
<body>
    <h1>FunASR WebSocket Client (Sender)</h1>
    <p><strong>Session ID:</strong> <span id="sessionId"></span></p>
    <button id="startButton">Start Recognition</button>
    <button id="stopButton" disabled>Stop Recognition</button>
    <h2>Results:</h2>
    <div id="results"></div>

    <script>
        const startButton = document.getElementById('startButton');
        const stopButton = document.getElementById('stopButton');
        const resultsDiv = document.getElementById('results');
        const sessionIdSpan = document.getElementById('sessionId');

        let websocket;
        let audioContext;
        let scriptProcessor;
        let mediaStream;

        const SAMPLE_RATE = 16000;
        // ScriptProcessorNode buffer sizes must be a power of two;
        // 4096 frames is 256 ms at 16 kHz.
        const BUFFER_SIZE = 4096;

        // Generate a simple UUID
        function generateUUID() {
            return ([1e7]+-1e3+-4e3+-8e3+-1e11).replace(/[018]/g, c =>
                (c ^ crypto.getRandomValues(new Uint8Array(1))[0] & 15 >> c / 4).toString(16)
            );
        }

        async function startRecording() {
            const sessionId = generateUUID();
            sessionIdSpan.textContent = sessionId;
            const wsUrl = `ws://${window.location.host}/ws/asr/${sessionId}?mode=sender`;

            websocket = new WebSocket(wsUrl);
            websocket.onopen = () => {
                console.log("WebSocket connection opened");
                startButton.disabled = true;
                stopButton.disabled = false;
                resultsDiv.innerHTML = '';
            };

            websocket.onmessage = (event) => {
                console.log("Message received:", event.data);
                const result = JSON.parse(event.data);
                const asrText = result.asr || '';
                const spkName = result.spk ? result.spk.speaker_name : 'Unknown';
                resultsDiv.innerHTML += `<p><strong>${spkName}:</strong> ${asrText}</p>`;
            };

            websocket.onclose = () => {
                console.log("WebSocket connection closed");
                stopRecording();
            };

            websocket.onerror = (error) => {
                console.error("WebSocket error:", error);
                alert("WebSocket connection failed!");
                stopRecording();
            };

            try {
                mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
                audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE });

                const source = audioContext.createMediaStreamSource(mediaStream);
                // Note: ScriptProcessorNode is deprecated in favor of AudioWorklet,
                // but it is still widely supported and keeps this example simple.
                scriptProcessor = audioContext.createScriptProcessor(BUFFER_SIZE, 1, 1);

                scriptProcessor.onaudioprocess = (e) => {
                    if (websocket && websocket.readyState === WebSocket.OPEN) {
                        const inputData = e.inputBuffer.getChannelData(0);
                        // The server expects float32 data; inputData is already a
                        // Float32Array, so send its underlying buffer directly.
                        websocket.send(inputData.buffer);
                    }
                };

                source.connect(scriptProcessor);
                scriptProcessor.connect(audioContext.destination);

            } catch (err) {
                console.error("Could not access the microphone:", err);
                alert("Microphone permission denied!");
                if (websocket) websocket.close();
            }
        }

        function stopRecording() {
            if (websocket && websocket.readyState === WebSocket.OPEN) {
                websocket.send("close");
            }
            if (mediaStream) {
                mediaStream.getTracks().forEach(track => track.stop());
            }
            if (scriptProcessor) {
                scriptProcessor.disconnect();
            }
            if (audioContext) {
                audioContext.close();
            }
            startButton.disabled = false;
            stopButton.disabled = true;
        }

        startButton.addEventListener('click', startRecording);
        stopButton.addEventListener('click', stopRecording);
    </script>
</body>
</html>
```