简介:本文详细介绍如何使用FunASR进行Python语音识别开发,涵盖环境配置、基础代码实现、模型选择与优化,以及进阶应用场景,帮助开发者快速掌握语音识别技术。
FunASR是由阿里巴巴达摩院语音实验室开源的语音识别工具包,其核心优势体现在三个方面:
相较于传统Kaldi工具,FunASR在中文场景下表现尤为突出。测试数据显示,在新闻播报场景中,其字符错误率(CER)较Kaldi降低37%,在会议记录场景中降低29%。这种优势源于其采用的动态词表技术,可实时适应专业术语和新兴词汇。
# Install FunASR plus the packages the examples use for microphone capture
# (pyaudio) and audio file reading (soundfile).
# NOTE(review): later examples also import torch, numpy, librosa, flask and
# pydub, which this command does not list explicitly — confirm they are installed.
pip install funasr pyaudio soundfile
import torch

# Expected to print True on a machine where PyTorch can see a CUDA device;
# False means no usable CUDA device was found.
print(torch.cuda.is_available())
FunASR提供预训练模型库,推荐从官方GitHub仓库获取:
# NOTE(review): the upstream FunASR project is published at
# github.com/modelscope/FunASR — confirm this clone URL and the models/ path.
git clone https://github.com/wenet-e2e/funasr.git
cd funasr/models
# Download the general-purpose Chinese model.
# NOTE(review): example.com is a placeholder host — substitute the real model URL.
wget https://example.com/path/to/paraformer-large-zh.tar.gz
tar -xzvf paraformer-large-zh.tar.gz
import soundfile as sf
import torch
from funasr import AutoModelForCTC, AutoProcessor

# NOTE(review): FunASR's published quick-start uses `AutoModel(...).generate(...)`;
# confirm that AutoModelForCTC / AutoProcessor exist in the installed version.

# Model initialisation
model = AutoModelForCTC.from_pretrained("paraformer-large-zh")
processor = AutoProcessor.from_pretrained("paraformer-large-zh")

# Audio loading
audio_path = "test.wav"
waveform, sample_rate = sf.read(audio_path)
if sample_rate != 16000:
    # Resample to the 16 kHz rate that is passed to the processor below
    # (librosa is imported lazily because it is only needed on this path).
    import librosa

    # librosa.resample returns a single ndarray, so the result must not be
    # tuple-unpacked.
    waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)

# Run recognition; no_grad() skips gradient tracking during inference.
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
transcription = processor.decode(logits[0])
print("识别结果:", transcription)
import queue
import threading

import numpy as np
import pyaudio
import torch


class AudioStream:
    """Capture 16 kHz mono int16 microphone audio into a thread-safe queue.

    PyAudio invokes ``callback`` for every captured buffer of 1600 frames
    (100 ms at 16 kHz); consumers pull the buffers with ``get_chunk``.
    """

    def __init__(self):
        self.q = queue.Queue()
        # Keep the PyAudio handle so close() can release the audio device.
        self._pa = pyaudio.PyAudio()
        self.stream = self._pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=16000,
            input=True,
            frames_per_buffer=1600,
            stream_callback=self.callback,
        )

    def callback(self, in_data, frame_count, time_info, status):
        """Queue one captured buffer as an int16 array and keep the stream running."""
        self.q.put(np.frombuffer(in_data, dtype=np.int16))
        return (None, pyaudio.paContinue)

    def get_chunk(self):
        """Block until the next captured buffer is available and return it."""
        return self.q.get()

    def close(self):
        """Stop capturing and release the stream and the PyAudio instance."""
        self.stream.stop_stream()
        self.stream.close()
        self._pa.terminate()


# Streaming recognition loop
def stream_recognize():
    """Continuously recognise microphone audio in 3200-sample windows.

    Runs until interrupted; each window's decoded text is printed. The
    audio device is released when the loop exits for any reason.
    """
    # Load the model before opening the microphone so captured audio does not
    # pile up in the queue while the weights are loading.
    processor = AutoProcessor.from_pretrained("paraformer-large-zh")
    model = AutoModelForCTC.from_pretrained("paraformer-large-zh")
    audio = AudioStream()
    buffer = []
    try:
        while True:
            chunk = audio.get_chunk()
            buffer.extend(chunk)
            if len(buffer) >= 3200:  # 200 ms of audio at 16 kHz
                # Scale int16 samples into the [-1, 1) float range.
                audio_data = np.array(buffer[:3200], dtype=np.float32) / 32768.0
                inputs = processor(audio_data, sampling_rate=16000, return_tensors="pt")
                with torch.no_grad():
                    logits = model(**inputs).logits
                partial_result = processor.decode(logits[0])
                print("实时结果:", partial_result)
                buffer = buffer[3200:]
    finally:
        audio.close()
针对专业领域(如医疗、法律),可通过以下方式优化:
# NOTE(review): `set_vocab` is not defined anywhere in this file — confirm the
# processor exposes it and what the numeric weight means.
processor.set_vocab({"专业术语": 100})  # boost the recognition priority of the term
from funasr.utils.lm import KenLM

# Load a domain language model from an ARPA file and attach it to the processor.
# NOTE(review): alpha/beta are presumably the LM weight and word-insertion
# bonus — confirm against the set_lm implementation.
lm = KenLM("medical_lm.arpa")
processor.set_lm(lm, alpha=0.8, beta=1.2)
使用FunASR的Speaker Diarization模块:
from funasr.diarization import PyannoteDiarization

diarizer = PyannoteDiarization()
audio_path = "meeting.wav"
segments = diarizer(audio_path)

# Recognise each speaker segment separately
for seg in segments:
    start, end = seg['start'], seg['end']
    # Extract the audio for this segment...
    # Run recognition on it...
import soundfile as sf
import torch
from flask import Flask, request, jsonify
from funasr import AutoModelForCTC, AutoProcessor

app = Flask(__name__)
# Load the model and its processor once at start-up so every request reuses them.
# NOTE(review): confirm AutoModelForCTC / AutoProcessor exist in the installed
# funasr version.
model = AutoModelForCTC.from_pretrained("paraformer-large-zh")
processor = AutoProcessor.from_pretrained("paraformer-large-zh")


@app.route('/asr', methods=['POST'])
def asr_service():
    """Transcribe an uploaded audio file.

    Expects a multipart POST with the audio under the ``file`` field.

    Returns:
        JSON ``{"transcription": text}`` on success, or
        ``{"error": ...}`` with HTTP 400 when no file was uploaded.
    """
    if 'file' not in request.files:
        # Report a client error instead of a 200 response carrying an error body.
        return jsonify({"error": "No file uploaded"}), 400
    file = request.files['file']
    waveform, sample_rate = sf.read(file, dtype='float32')
    if waveform.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel audio;
        # average the channels to get a mono signal.
        waveform = waveform.mean(axis=1)
    if sample_rate != 16000:
        # The processor below is told the audio is 16 kHz, so resample uploads
        # recorded at any other rate rather than mislabelling them.
        import librosa

        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    text = processor.decode(logits[0])
    return jsonify({"transcription": text})
在视频字幕生成场景中,可结合FFmpeg实现自动化处理:
import subprocess


def generate_subtitles(video_path):
    """Extract a video's audio track with FFmpeg and write an SRT subtitle file.

    Args:
        video_path: Path of the input video file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.

    The audio is written to ``temp.wav`` (16 kHz, mono) and the subtitles to
    ``subtitles.srt`` in the current working directory.
    """
    # Extract audio. An argument list (no shell) keeps spaces or shell
    # metacharacters in video_path from being interpreted as commands.
    audio_path = "temp.wav"
    cmd = [
        "ffmpeg",
        "-y",  # overwrite a stale temp.wav instead of waiting on a prompt
        "-i", video_path,
        "-ar", "16000",
        "-ac", "1",
        audio_path,
    ]
    # check=True stops here on an ffmpeg failure instead of writing subtitles
    # for audio that was never produced.
    subprocess.run(cmd, check=True)
    # Run recognition (using the code shown earlier)
    # ...
    # Write the SRT file; explicit UTF-8 keeps non-ASCII text intact on
    # platforms whose default encoding is not UTF-8.
    with open("subtitles.srt", 'w', encoding="utf-8") as f:
        f.write("1\n00:00:00,000 --> 00:00:05,000\n识别文本\n\n")
import numpy as np

# Process 10 audio files in a single batch.
# NOTE(review): `load_audio` is not defined in this file; it is assumed to
# return a 1-D sample array per file — confirm.
batch_audio = [load_audio(f"audio_{i}.wav") for i in range(10)]
# np.stack requires equal-length arrays, so right-pad every clip with zeros
# (silence) up to the longest one; equal-length clips are left unchanged.
max_len = max(len(clip) for clip in batch_audio)
stacked = np.stack(
    [np.pad(clip, (0, max_len - len(clip))) for clip in batch_audio]
)
inputs = processor(stacked, sampling_rate=16000, return_tensors="pt")
# Dynamically quantize the model's Linear layers to 8-bit integers.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)
使用pydub对音频进行增强:
from pydub import AudioSegment

sound = AudioSegment.from_wav("noisy.wav")
# Low-pass at 3000 Hz to remove high-frequency noise.
cleaned = sound.low_pass_filter(3000)
cleaned.export("clean.wav", format="wav")
from funasr import Trainer

# NOTE(review): `model` and `train_dataset` must already be defined by the
# caller; confirm the Trainer signature against the installed funasr version.
trainer = Trainer(
    model,
    train_dataset,
    learning_rate=1e-5,
    num_epochs=10,
)
使用torch.nn.utils.prune进行通道剪枝,模型大小可压缩40%。
通过系统掌握FunASR的Python开发技术,开发者可快速构建从消费级应用到企业级解决方案的完整语音识别系统。建议持续关注官方GitHub仓库的更新日志,及时获取模型优化和功能增强信息。