简介:本文详细介绍如何使用Python和隐马尔可夫模型(HMM)实现基础语音识别系统,包含PyCharm环境配置、HMM原理解析、MFCC特征提取、模型训练与解码的全流程实践。
pip install numpy scipy librosa hmmlearn matplotlib pyaudio
- librosa:音频处理(加载、分帧、特征提取)
- hmmlearn:HMM模型实现(支持高斯混合模型)
- pyaudio:实时音频采集(可选)

PyCharm 配置建议:
- View → Scientific Mode 获取交互式绘图窗口
- Tabnine 或 Kite 插件提升 HMM 代码编写效率
- Profiler 工具定位 MFCC 计算瓶颈
import librosa


def extract_mfcc(file_path, n_mfcc=13):
    """Load an audio file and return its MFCC feature matrix.

    Parameters
    ----------
    file_path : str
        Path of the audio file to analyse.
    n_mfcc : int
        Number of MFCC coefficients extracted per frame.

    Returns
    -------
    numpy.ndarray
        Feature matrix of shape (n_frames, n_mfcc).
    """
    # Resample everything to 16 kHz so all utterances share one rate.
    signal, rate = librosa.load(file_path, sr=16000)
    features = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
    # Transpose so rows are time frames and columns are coefficients.
    return features.T
data/
├── train/
│   ├── 001_hello.wav
│   └── 002_world.wav
└── test/
使用强制对齐工具(如 Gentle)获取音素级时间标注
from hmmlearn import hmm
import numpy as np


class PhoneHMM:
    """Gaussian-mixture HMM wrapper modelling a single phone unit."""

    def __init__(self, n_states=5, n_mix=3):
        # Diagonal covariances keep the parameter count manageable.
        # init_params="cm" initialises covariances and means;
        # params="cmst" trains covariances, means, start probs and transitions.
        self.model = hmm.GMMHMM(
            n_components=n_states,
            n_mix=n_mix,
            covariance_type="diag",
            init_params="cm",
            params="cmst",
            verbose=True,
        )

    def train(self, X, lengths):
        """Fit the model on stacked features.

        X: array of shape (total_frames, feature_dim).
        lengths: per-utterance frame counts, summing to total_frames.
        """
        self.model.fit(X, lengths)

    def recognize(self, X):
        """Return the decoded hidden-state sequence for feature matrix X."""
        _, states = self.model.decode(X)
        return states
from sklearn.preprocessing import StandardScaler

# Normalise the training features to zero mean / unit variance.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
并行训练:
from joblib import Parallel, delayed


def train_worker(args):
    """Train one fresh PhoneHMM on an (X, lengths) chunk; return the fitted model."""
    model = PhoneHMM()
    model.train(*args)
    return model


# Fan the chunks out across every available CPU core.
results = Parallel(n_jobs=-1)(
    delayed(train_worker)(data_chunk) for data_chunk in data_chunks
)
import matplotlib.pyplot as plt


def plot_states(state_seq):
    """Render an HMM state sequence as a one-row heat map."""
    plt.figure(figsize=(12, 4))
    # Wrap the sequence in a list so imshow receives a 2-D (1 x T) array.
    plt.imshow([state_seq], aspect='auto', cmap='viridis')
    plt.colorbar()
    plt.title("HMM State Sequence")
使用 plt.hist2d 绘制 MFCC 参数分布热力图。

NumPy 向量化:
# Before optimisation: per-frame Python loop.
for i in range(n_frames):
    frame = X[i]
    # process ...

# After optimisation: one vectorised reshape replaces the whole loop.
frames = X.reshape(-1, frame_size)
使用 numpy.float16 减少内存占用
import os

import librosa
import numpy as np
from hmmlearn import hmm
# BUG FIX: StandardScaler was used in train() below but never imported,
# which raised a NameError when running this script standalone.
from sklearn.preprocessing import StandardScaler


class SimpleASR:
    """Minimal HMM-based recogniser skeleton.

    Keeps one PhoneHMM per phone label plus the feature scaler fitted
    during training. Both PhoneHMM and extract_mfcc() are defined
    earlier in the article and must be in scope.
    """

    def __init__(self, phone_models):
        self.phone_models = phone_models  # mapping: phone label -> HMM model
        self.scaler = None                # StandardScaler, fitted by train()

    def train(self, audio_paths, labels):
        """Extract MFCCs from every file, fit the scaler, and prepare per-label models.

        NOTE(review): the per-label segmentation is intentionally left
        unimplemented in this tutorial skeleton — a real project needs a
        proper forced-alignment step here.
        """
        # Extract MFCC features for every utterance.
        all_features = []
        lengths = []
        for path in audio_paths:
            mfcc = extract_mfcc(path)
            all_features.append(mfcc)
            lengths.append(len(mfcc))
        X = np.vstack(all_features)

        # Standardise features to zero mean / unit variance.
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Split the training data per label and lazily create models.
        for label, features in zip(labels, all_features):
            if label not in self.phone_models:
                self.phone_models[label] = PhoneHMM()
            # Label-wise segmentation / model fitting goes here.

    def recognize(self, audio_path):
        """Return a per-frame state sequence for one audio file.

        Simplified placeholder: a production system would score every
        phone model and run Viterbi decoding instead of random states.
        """
        mfcc = extract_mfcc(audio_path)
        X = self.scaler.transform(mfcc)
        # Frame-by-frame classification placeholder (real code: Viterbi decode).
        state_seq = []
        for frame in X:
            # Should compute the likelihood under every phone model;
            # simplified example: pick a random state in [0, 5).
            state_seq.append(np.random.randint(0, 5))
        return state_seq


# Usage example
if __name__ == "__main__":
    # Initialise (load real data in practice).
    phone_models = {}
    asr = SimpleASR(phone_models)

    # Train (replace with real paths).
    train_paths = ["data/train/001.wav"]
    train_labels = ["/a/"]
    asr.train(train_paths, train_labels)

    # Recognise.
    test_path = "data/test/001.wav"
    result = asr.recognize(test_path)
    print("Recognized states:", result)
实时处理:
import pyaudio


def realtime_recognize():
    """Capture 16 kHz mono audio from the microphone and recognise it in real time.

    Runs until interrupted (Ctrl-C). BUG FIX: the original version never
    closed the stream or terminated the PyAudio handle, leaking the audio
    device on exit; cleanup is now guaranteed via try/finally.
    """
    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=16000,
        input=True,
        frames_per_buffer=1024,
    )
    try:
        while True:
            data = np.frombuffer(stream.read(1024), dtype=np.int16)
            # Real-time MFCC extraction and recognition goes here.
            # ...
    except KeyboardInterrupt:
        pass  # graceful shutdown on Ctrl-C
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
(在 hmmlearn 中可通过 learning_rate 参数调节)。本文提供的实现框架可作为语音识别研究的起点,实际工业级系统需考虑声学模型、语言模型和解码器的联合优化。建议从简单 HMM 实现入手,逐步叠加复杂技术模块。