简介:本文详细介绍如何使用FunASR语音识别框架与PyAudio音频库,在Python环境下实现电脑本地麦克风的实时语音转文本功能,涵盖环境配置、核心代码实现及优化策略。
在需要隐私保护或离线环境的场景中(如医疗问诊、会议记录、个人笔记等),本地化语音识别方案具有不可替代的优势。FunASR作为一款开源的语音识别工具包,支持多种声学模型和语言模型,尤其适合中文场景;PyAudio则提供了跨平台的音频流捕获能力。两者结合可实现低延迟、高准确率的实时语音转文本功能。
| 方案 | 延迟 | 准确率 | 部署复杂度 | 隐私性 |
|---|---|---|---|---|
| 云端API | 高 | 高 | 低 | 低 |
| 本地模型部署 | 低 | 中高 | 中 | 高 |
| FunASR+PyAudio | 极低 | 高 | 低 | 极高 |
# Create a virtual environment (recommended)
python -m venv asr_env
source asr_env/bin/activate        # Linux/macOS
# asr_env\Scripts\activate         # Windows

# Install the core dependencies.
# torch/torchaudio are listed explicitly because the example code imports
# torch; FunASR's README names them as prerequisites rather than installing
# them for you (confirm against the FunASR version you install).
pip install pyaudio funasr numpy torch torchaudio

# Optional (performance)
pip install onnxruntime            # ONNX-accelerated inference
常见问题处理:
# PortAudio development files. PyAudio is a binding to PortAudio, so this is
# presumably what pip needs when it has to build PyAudio from source — confirm
# the package name for your distribution.
sudo apt-get install portaudio19-dev # Ubuntu/Debian
import pyaudio
import numpy as np


class AudioStream:
    """Capture mono 16-bit PCM audio from the default input device.

    Wraps a PyAudio input stream. Call ``start_stream()`` once, then
    ``read_chunk()`` repeatedly, and ``stop_stream()`` when finished.

    Args:
        sample_rate: Sampling rate in Hz passed to PyAudio (default 16000).
        chunk_size: Number of frames requested per read (default 1024,
            i.e. 64 ms at 16 kHz).
    """

    def __init__(self, sample_rate=16000, chunk_size=1024):
        self.p = pyaudio.PyAudio()
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.stream = None  # opened lazily by start_stream()

    def start_stream(self):
        """Open the input stream with the configured rate and buffer size."""
        self.stream = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
        )

    def read_chunk(self):
        """Read ``chunk_size`` frames and return them as an int16 array.

        Returns:
            A 1-D ``numpy.ndarray`` of dtype ``int16``.

        Raises:
            RuntimeError: If the stream has not been started.
        """
        if self.stream is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError("start_stream() must be called before read_chunk()")
        # exception_on_overflow=False: a late read does not raise on input
        # overflow, so the capture loop keeps running.
        data = self.stream.read(self.chunk_size, exception_on_overflow=False)
        return np.frombuffer(data, dtype=np.int16)

    def stop_stream(self):
        """Stop and close the stream and release PyAudio. Safe to call twice."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.p is not None:
            self.p.terminate()
            # Cleared so that a second stop_stream() call is a no-op.
            self.p = None
关键参数说明:
- sample_rate=16000:语音识别常用采样率
- chunk_size=1024:每次读取的音频帧数(16kHz下约64ms)
import numpy as np
from funasr import AutoModel


class ASRProcessor:
    """Thin wrapper around a FunASR model for one-shot recognition.

    Args:
        model_dir: Passed straight to ``AutoModel(model=...)``. FunASR
            documents this as either a model name from its model repository
            or a path to a model directory on local disk.
    """

    def __init__(self, model_dir="paraformer-large"):
        # AutoModel is FunASR's public entry point; it builds the model for
        # inference, so no separate eval()/no_grad handling is done here.
        self.model = AutoModel(model=model_dir)

    def recognize(self, audio_data):
        """Transcribe one buffer of 16 kHz mono audio.

        Args:
            audio_data: Array-like of samples. int16 PCM is rescaled to
                float32; other dtypes are passed through unchanged.

        Returns:
            The recognized text, or ``""`` for empty input or an empty result.
        """
        samples = np.asarray(audio_data).reshape(-1)
        if samples.size == 0:
            return ""
        if samples.dtype == np.int16:
            # NOTE(review): FunASR's own byte-loading path normalizes int16
            # PCM to floats in [-1, 1]; the same scaling is applied here.
            # Confirm against the installed FunASR version.
            samples = samples.astype(np.float32) / 32768.0
        results = self.model.generate(input=samples)
        if not results:
            return ""
        # generate() returns a list with one dict per input item.
        return results[0].get("text", "")
模型选择建议:
- paraformer-large:高精度模型(推荐)
- paraformer-medium:平衡精度与速度
- paraformer-small:资源受限环境使用

注:以上名称需为FunASR可解析的模型名或本地模型目录;FunASR官方示例中使用的别名为 paraformer-zh 等,请以所安装版本的文档为准。
import time
import numpy as np
import torch
from queue import Empty, Queue
from threading import Thread


class RealTimeASR:
    """Microphone-to-text loop: a capture thread feeds a result queue.

    Args:
        vad_threshold: Peak absolute sample value (int16 scale) a chunk must
            exceed to be sent to the recognizer. Tune it for the environment.
    """

    def __init__(self, vad_threshold=500):
        self.audio = AudioStream()
        self.asr = ASRProcessor()
        self.text_queue = Queue()
        self.running = False
        self.vad_threshold = vad_threshold

    def _audio_worker(self):
        """Capture chunks while ``self.running`` and queue recognized text."""
        self.audio.start_stream()
        try:
            while self.running:
                data = self.audio.read_chunk()
                if not data.size:
                    continue
                # Naive energy gate (use a real VAD in production). Widen to
                # int32 first: abs(-32768) wraps around in int16, which would
                # make the loudest possible sample look silent.
                peak = int(np.max(np.abs(data.astype(np.int32))))
                if peak > self.vad_threshold:
                    self.text_queue.put(self.asr.recognize(data))
                # No sleep here: read_chunk() already blocks until a chunk is
                # available, and sleeping on top of it would drop audio.
        finally:
            # Release the device even if capture or recognition raises.
            self.audio.stop_stream()

    def start(self):
        """Run until Ctrl+C or until the capture thread exits; print results."""
        self.running = True
        audio_thread = Thread(target=self._audio_worker, daemon=True)
        audio_thread.start()
        print("实时语音识别启动(按Ctrl+C退出)")
        try:
            # Stop looping if the worker died (e.g. no input device) instead
            # of spinning forever with nothing to print.
            while audio_thread.is_alive():
                try:
                    text = self.text_queue.get(timeout=0.1)
                except Empty:
                    continue
                print(f"识别结果: {text}")
        except KeyboardInterrupt:
            pass
        finally:
            self.running = False
            # Give the worker a moment to leave its loop and close the stream.
            audio_thread.join(timeout=2.0)
            print("系统退出")


if __name__ == "__main__":
    asr_system = RealTimeASR()
    asr_system.start()
批处理优化:将多个音频块合并处理
BUFFER_SIZE = 5  # recognize once this many chunks have been collected
buffer = []      # pending audio chunks; appended to by the capture loop (not shown here)


def process_buffer(self):
    """Recognize the buffered chunks as one utterance once enough are queued.

    Written as a method body: ``self`` must provide ``asr.recognize`` and
    ``text_queue.put``. Does nothing until ``buffer`` holds at least
    ``BUFFER_SIZE`` chunks; afterwards the buffer is emptied in place.
    """
    if len(buffer) < BUFFER_SIZE:
        return
    combined = np.concatenate(buffer)
    self.text_queue.put(self.asr.recognize(combined))
    buffer.clear()
模型量化:使用ONNX Runtime进行FP16量化
from funasr import AutoModel

# Export through FunASR's AutoModel, following the export example in the
# FunASR README. quantize=True asks for a quantized ONNX model as well.
# NOTE(review): the output location and the quantization precision are
# decided by FunASR — check the export documentation for your version.
model = AutoModel(model="paraformer", device="cpu")
model.export(quantize=True)
语言模型融合:加载n-gram语言模型
# NOTE(review): this import path and the lm_path keyword could not be matched
# to FunASR's documented Python API; verify both against the installed
# version before relying on this snippet.
from funasr.models.paraformer import ParaformerForASR

model = ParaformerForASR.from_pretrained(
    "paraformer-large",
    lm_path="path/to/lm.bin",
)
环境适配:针对不同噪声环境训练声学模型
# Add a timestamp and a speaker label to every transcript line
class MeetingRecorder(RealTimeASR):
    """RealTimeASR variant whose results carry a speaker tag and wall-clock time.

    ``speaker_id`` is a fixed placeholder set to 0; nothing in this class
    performs speaker identification.
    """

    def __init__(self):
        super().__init__()
        self.speaker_id = 0
        # The capture worker calls self.asr.recognize() directly, so an
        # override on this class would never run. Keep the raw recognizer and
        # route the worker's call through recognize() below.
        self._transcribe = self.asr.recognize
        self.asr.recognize = self.recognize

    def recognize(self, audio_data):
        """Return the recognized text prefixed with speaker and HH:MM:SS."""
        text = self._transcribe(audio_data)
        timestamp = time.strftime("%H:%M:%S")
        return f"[Speaker {self.speaker_id}] {timestamp}: {text}"
# GUI integration (tkinter example)
import tkinter as tk
from queue import Empty
from threading import Thread
from tkinter import scrolledtext


class SubtitleSystem(RealTimeASR):
    """RealTimeASR variant that shows results in a scrolling Tk text window."""

    def __init__(self):
        super().__init__()
        self.root = tk.Tk()
        self.text_area = scrolledtext.ScrolledText(self.root, wrap=tk.WORD)
        self.text_area.pack(fill=tk.BOTH, expand=True)

    def start(self):
        """Start capture in the background and run the Tk event loop.

        The base-class start() is deliberately not called: it blocks in its
        own print loop, so mainloop() would never be reached.
        """
        self.running = True
        audio_thread = Thread(target=self._audio_worker, daemon=True)
        audio_thread.start()
        self.root.after(50, self._display_worker)
        try:
            self.root.mainloop()
        finally:
            # Window closed (or mainloop failed): stop capturing.
            self.running = False
            audio_thread.join(timeout=2.0)

    def _display_worker(self):
        """Move queued results into the text widget, then reschedule itself.

        Runs as a Tk ``after`` callback on the GUI thread rather than in a
        separate thread, so widgets are only touched from the thread that
        owns the event loop.
        """
        if not self.running:
            return
        while True:
            try:
                line = self.text_queue.get_nowait()
            except Empty:
                break
            self.text_area.insert(tk.END, line + "\n")
            self.text_area.see(tk.END)
        self.root.after(50, self._display_worker)
# 修改 /etc/pulse/default.pa,增加以下配置:
load-module module-udev-detect tsched=0
FROM python:3.9-slim

# PyAudio is compiled against PortAudio at install time; the slim base image
# presumably ships neither a compiler nor the PortAudio headers (confirm for
# the image tag you use).
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential portaudio19-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy the requirements first so the dependency layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .
CMD ["python", "main.py"]
本方案通过FunASR与PyAudio的组合,实现了:
未来优化方向包括:
完整代码实现已超过1000行核心逻辑,建议开发者根据实际场景调整参数(如音频块大小、VAD阈值等),并通过日志系统监控识别质量。对于企业级应用,可考虑将FunASR替换为支持分布式推理的版本以提升并发能力。