简介:本文详细介绍FunASR语音识别工具的Python实现,涵盖环境配置、基础功能调用、进阶优化技巧及实际应用场景,提供完整代码示例与性能调优建议。
FunASR是由中国科学院自动化研究所模式识别国家重点实验室开发的开源语音识别工具包,基于深度学习框架实现端到端语音识别。其核心优势在于:
相较于传统Kaldi等工具,FunASR在解码效率上提升40%,在16kHz音频的词错率(WER)指标上达到行业领先水平。最新v2.3版本新增声纹识别与说话人分割功能,形成完整的语音处理解决方案。
# Base environment requirements:
#   Python 3.8+, PyTorch 1.10+, CUDA 11.3+ (for the GPU build)

# Create a virtual environment (recommended)
python -m venv funasr_env
source funasr_env/bin/activate  # Linux/Mac
# Windows: .\funasr_env\Scripts\activate

# Install core dependencies
pip install torch torchvision torchaudio
# Quote the extras spec so shells with glob expansion (e.g. zsh) don't
# try to expand the brackets in "funasr[all]".
pip install "funasr[all]"  # full installation
# Or install step by step
pip install funasr funasr-onnxruntime funasr-modelmanager
FunASR提供模型仓库管理系统,支持自动下载与版本控制:
from funasr import AutoModel

# Automatic download of the Chinese pretrained model
# (the first run fetches it into cache_dir).
model = AutoModel.from_pretrained(
    "paraformer-zh",
    cache_dir="./models",
    device="cuda:0",  # or "cpu"
)

# Manual model management (better suited to production environments)
from funasr.modelmanager import ModelManager

mm = ModelManager(cache_dir="./model_cache")
mm.download_model("paraformer-zh", version="2.3.0")
import sounddevice as sd
import numpy as np
from funasr import AutoModel


class StreamASR:
    """Streaming recognizer: buffers microphone samples and decodes fixed-size chunks."""

    def __init__(self):
        self.model = AutoModel.from_pretrained("paraformer-zh", device="cuda:0")
        self.buffer = []
        self.chunk_size = 1600  # 100 ms at 16 kHz

    def callback(self, indata, frames, time, status):
        """sounddevice InputStream callback: accumulate samples, decode complete chunks."""
        if status:
            print(status)
        self.buffer.extend(indata.flatten().tolist())
        # Drain EVERY complete chunk, not just one per callback: if a callback
        # delivers more than chunk_size samples, a plain `if` would let the
        # buffer grow without bound.
        while len(self.buffer) >= self.chunk_size:
            chunk = np.array(self.buffer[:self.chunk_size])
            self.buffer = self.buffer[self.chunk_size:]
            text = self.model.decode(chunk.reshape(1, -1))
            print(f"识别结果: {text}")


# Start real-time recognition
asr = StreamASR()
with sd.InputStream(samplerate=16000, channels=1, callback=asr.callback):
    print("开始录音(按Ctrl+C停止)...")
    try:
        while True:
            # Idle in the main thread without burning a CPU core
            # (the original `while True: pass` busy-waits at 100% CPU).
            sd.sleep(1000)
    except KeyboardInterrupt:
        pass
import json
from datetime import datetime  # fix: datetime.now() was used without any import

from funasr import AutoModel, AudioIn


def batch_recognize(audio_paths, output_path):
    """Recognize a batch of audio files and write the results to a JSON file.

    Args:
        audio_paths: iterable of paths to audio files (read at 16 kHz).
        output_path: path of the JSON file to write.
    """
    model = AutoModel.from_pretrained("paraformer-zh", device="cuda:0")
    results = []
    for path in audio_paths:
        audio = AudioIn(path, sample_rate=16000)
        wav_data = audio.read()
        text = model.decode(wav_data)
        results.append({
            "file": path,
            "text": text,
            "timestamp": str(datetime.now()),
        })
    # Persist results as JSON, keeping non-ASCII (Chinese) text readable
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


# Usage example
audio_files = ["test1.wav", "test2.wav"]
batch_recognize(audio_files, "asr_results.json")
# fix: AudioIn was used below but missing from the import
from funasr import AutoModel, AudioIn, DiarizationModel


def speaker_diarization(audio_path):
    """Run speaker diarization, then recognize each speaker segment.

    Returns:
        list of dicts with speaker label, start/end times (seconds) and text.
    """
    # Initialize models
    asr_model = AutoModel.from_pretrained("paraformer-zh")
    diar_model = DiarizationModel.from_pretrained("ecapa-tdnn")

    # Audio preprocessing
    audio = AudioIn(audio_path, sample_rate=16000)
    wav_data = audio.read()

    # Speaker segmentation
    segments = diar_model.segment(wav_data, frame_length=2.5, frame_shift=0.1)

    # Recognize each segment
    results = []
    for seg in segments:
        start = int(seg['start'] * 16000)  # seconds -> sample index at 16 kHz
        end = int(seg['end'] * 16000)
        seg_audio = wav_data[start:end]
        text = asr_model.decode(seg_audio)
        results.append({
            "speaker": seg['speaker'],
            "start": seg['start'],
            "end": seg['end'],
            "text": text,
        })
    return results
# fix: AudioIn was used below but missing from the import
from funasr import AutoModel, AudioIn, HotwordConfig


def hotword_recognition(audio_path, hotwords):
    """Recognize an audio file with hotword (contextual-biasing) support.

    Args:
        audio_path: path to a 16 kHz audio file.
        hotwords: list of phrases to boost during decoding.
    """
    # Configure hotwords
    config = HotwordConfig(
        hotwords=hotwords,
        boost_score=2.5,   # hotword weight
        max_ngram_size=3,  # maximum n-gram length
    )
    model = AutoModel.from_pretrained("paraformer-zh")
    model.set_hotword_config(config)
    audio = AudioIn(audio_path, sample_rate=16000)
    text = model.decode(audio.read())
    return text


# Usage example
hotwords = ["人工智能", "深度学习", "FunASR"]
result = hotword_recognition("tech_talk.wav", hotwords)
# fix: the original used typographic "smart" quotes (“…”/”…”), which are a
# Python syntax error; they must be plain ASCII double quotes.
model = AutoModel.from_pretrained(
    "paraformer-zh",
    quantize=True,   # enable quantization
    device="cuda:0",
)
2. **批处理优化**:合并多个音频进行批量推理

```python
import torch  # fix: torch.stack/from_numpy were used without importing torch

from funasr import AutoModel


def batch_decode(wav_list, batch_size=8):
    """Decode a list of waveforms in mini-batches.

    Args:
        wav_list: list of equal-length numpy arrays (pre-processed audio).
        batch_size: number of clips per inference batch.
    """
    model = AutoModel.from_pretrained("paraformer-zh")
    results = []
    for i in range(0, len(wav_list), batch_size):
        batch = wav_list[i:i + batch_size]
        # Assumes every clip was already padded/cropped to the same length;
        # real code must handle variable-length audio (simplified here).
        batch_tensor = torch.stack([torch.from_numpy(x) for x in batch])
        texts = model.decode_batch(batch_tensor)
        results.extend(texts)
    return results
```
import logging

from funasr import AutoModel, AudioIn


class RobustASR:
    """ASR wrapper with error logging and graceful failure handling."""

    def __init__(self):
        self.logger = logging.getLogger("FunASR")
        self.logger.setLevel(logging.INFO)
        # fix: logging.getLogger("FunASR") returns the same logger object every
        # time, so unconditionally adding a FileHandler per instance duplicates
        # every log line. Attach the handler only once.
        if not self.logger.handlers:
            self.logger.addHandler(logging.FileHandler("asr_errors.log"))
        try:
            self.model = AutoModel.from_pretrained("paraformer-zh")
        except Exception as e:
            self.logger.error(f"模型加载失败: {str(e)}")
            raise

    def recognize(self, audio_path):
        """Return recognized text, or an "ERROR: ..." string on failure."""
        try:
            audio = AudioIn(audio_path)
            return self.model.decode(audio.read())
        except FileNotFoundError:
            self.logger.error(f"音频文件不存在: {audio_path}")
            return "ERROR: 文件未找到"
        except Exception as e:
            self.logger.error(f"识别过程中出错: {str(e)}")
            return "ERROR: 识别失败"
import json
from datetime import datetime

# fix: AudioIn was used below but missing from the import
from funasr import AutoModel, AudioIn, DiarizationModel


class MeetingTranscriber:
    """Transcribe a meeting recording into a speaker-attributed transcript."""

    def __init__(self):
        self.asr = AutoModel.from_pretrained("paraformer-zh")
        self.diar = DiarizationModel.from_pretrained("ecapa-tdnn")

    def transcribe(self, audio_path):
        """Return a dict with meeting_id, speaker-attributed transcript and a keyword summary."""
        audio = AudioIn(audio_path, sample_rate=16000)
        wav = audio.read()
        # Speaker segmentation
        segments = self.diar.segment(wav, frame_length=3.0)
        # Recognize each segment
        transcript = []
        for seg in segments:
            start = int(seg['start'] * 16000)  # seconds -> sample index at 16 kHz
            end = int(seg['end'] * 16000)
            text = self.asr.decode(wav[start:end])
            transcript.append({
                "speaker": seg['speaker'],
                "timestamp": seg['start'],
                "text": text,
            })
        # Structured output
        output = {
            "meeting_id": str(datetime.now().timestamp()),
            "transcript": transcript,
            "summary": self._generate_summary(transcript),
        }
        return output

    def _generate_summary(self, transcript):
        """Simplified summary: the five most frequent whitespace-separated tokens."""
        import collections
        words = [word for seg in transcript for word in seg['text'].split()]
        word_counts = collections.Counter(words)
        top_words = word_counts.most_common(5)
        return f"关键词: {', '.join([w[0] for w in top_words])}"
from flask import Flask, request, jsonify
from funasr import AutoModel
import soundfile as sf
import numpy as np

app = Flask(__name__)
# Load the model once at startup so requests share a single instance.
model = AutoModel.from_pretrained("paraformer-zh", device="cuda:0")


@app.route('/asr', methods=['POST'])
def asr_service():
    """HTTP endpoint: accept an uploaded audio file, return recognized text as JSON."""
    if 'file' not in request.files:
        return jsonify({"error": "No audio file"}), 400
    file = request.files['file']
    wav_data, sr = sf.read(file, dtype='float32')
    if sr != 16000:
        # A production deployment should resample here instead of rejecting.
        return jsonify({"error": "Unsupported sample rate"}), 400
    text = model.decode(wav_data)
    return jsonify({"text": text})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
现象:CUDA out of memory错误
解决方案:
- 减小batch_size参数
- 调用torch.cuda.empty_cache()清理显存缓存

排查步骤:
- 增大beam_size参数(默认10,可增至20)

常见原因:
- 使用cache_dir参数指定本地缓存路径
- 通过from_pretrained("paraformer-zh", version="2.3.0")固定模型版本

FunASR团队正在开发v3.0版本,重点改进方向包括:
开发者可通过GitHub参与贡献,或关注官方文档获取最新技术动态。建议定期检查funasr.models模块获取新模型更新。
本文通过系统化的技术解析和实战案例,全面展示了FunASR在Python环境中的语音识别实现。从基础环境搭建到高级功能开发,提供了完整的解决方案。实际部署时,建议结合具体场景进行参数调优,并建立完善的错误处理机制。随着深度学习技术的演进,FunASR将持续为语音交互领域提供高效可靠的解决方案。