简介:本文深度解析Python语音识别中的特征提取技术,从时域频域分析到MFCC/梅尔频谱实战,结合Librosa与Python_Speech_Features库实现完整流程,助力开发者掌握语音信号处理核心技能。
在语音识别系统中,特征提取是连接原始音频信号与机器学习模型的关键桥梁。人类听觉系统通过耳蜗对声音进行频谱分析,而语音识别系统需要通过数字信号处理技术模拟这一过程。特征提取的质量直接影响模型识别准确率,优秀的特征应具备:良好的区分性、对噪声与说话人差异的鲁棒性,以及较低的维度与冗余。
传统语音识别系统采用MFCC(梅尔频率倒谱系数)作为标准特征,而深度学习时代虽出现端到端模型,但特征提取仍是理解语音本质的重要基础。
使用sounddevice和numpy实现实时录音:
# Capture a fixed-length mono clip from the default input device.
import sounddevice as sd
import numpy as np

fs = 16000    # sampling rate: 16 kHz
duration = 5  # recording length in seconds

print("开始录音...")
recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
sd.wait()  # block until the capture buffer is full
print("录音结束")
提升高频分量,补偿语音信号受口鼻辐射影响的衰减:
def pre_emphasis(signal, coefficient=0.97):
    """Boost high frequencies: y[t] = x[t] - coefficient * x[t-1].

    The first sample is passed through unchanged; this compensates the
    high-frequency roll-off introduced by lip/nose radiation.
    """
    differenced = signal[1:] - coefficient * signal[:-1]
    return np.append(signal[0], differenced)


emphasized_signal = pre_emphasis(recording.flatten())
将连续信号分割为短时帧(通常20-40ms),使用汉明窗减少频谱泄漏:
# --- Framing: split the signal into overlapping, windowed short-time frames ---
# Cast to int: 0.025 * fs is a float, and floats are rejected by np.hamming
# and by the integer stride/shape arguments of as_strided.
frame_length = int(round(0.025 * fs))  # 25 ms frame length
frame_step = int(round(0.01 * fs))     # 10 ms frame shift
num_frames = 1 + int(np.ceil(float(len(emphasized_signal) - frame_length) / frame_step))
# Pad so the LAST frame has a full frame_length of samples to read.
# (num_frames * frame_step, as originally allocated, is too short and let
# as_strided read past the end of the buffer.)
padded_length = (num_frames - 1) * frame_step + frame_length
padded_signal = np.zeros(padded_length)
padded_signal[:len(emphasized_signal)] = emphasized_signal
frames = np.lib.stride_tricks.as_strided(
    padded_signal,
    shape=(num_frames, frame_length),
    strides=(frame_step * padded_signal.itemsize, padded_signal.itemsize),
).copy()  # copy() is required: consecutive frames overlap in the strided view,
          # so the in-place windowing below would otherwise multiply shared
          # samples several times and corrupt neighbouring frames.
# Apply a Hamming window to each frame to reduce spectral leakage.
hamming_window = np.hamming(frame_length)
frames *= hamming_window
def calculate_energy(frames):
    """Short-time energy: sum of squared samples for each frame (row)."""
    return np.sum(frames ** 2, axis=1)


energy = calculate_energy(frames)
def calculate_zero_crossing_rate(frames):
    """Fraction of sample positions at which the signal changes sign.

    Called below with one 1-D frame at a time. The original divided by
    frames.shape[1], which raises IndexError on a 1-D array; shape[-1]
    gives the frame length for the 1-D case.
    """
    zero_crossings = np.where(np.diff(np.sign(frames)))[0]
    return len(zero_crossings) / float(frames.shape[-1])


zcr = np.array([calculate_zero_crossing_rate(frame) for frame in frames])
def calculate_fft(frames, nfft=2048):
    """Magnitude spectrum of each frame via a real FFT.

    Frames are zero-padded (or truncated) to nfft points by np.fft.rfft.
    nfft is now a parameter instead of a hard-coded 2048; the default
    reproduces the original behaviour exactly. Returns the first
    nfft // 2 bins per frame (the Nyquist bin is dropped, as before).
    """
    mag_frames = np.absolute(np.fft.rfft(frames, nfft))
    return mag_frames[:, :nfft // 2]


fft_frames = calculate_fft(frames)
def calculate_power_spectrum(fft_frames, nfft=2048):
    """Periodogram estimate of the power spectrum: |X|^2 / nfft per bin.

    nfft replaces the hard-coded 2048/1024 pair; the default reproduces
    the original behaviour exactly (the slice keeps at most nfft // 2
    columns, a no-op for input produced by calculate_fft above).
    """
    return ((1.0 / nfft) * np.square(fft_frames))[:, :nfft // 2]


power_spectrum = calculate_power_spectrum(fft_frames)
import librosa


def extract_mfcc(signal, sr=16000, n_mfcc=13):
    """MFCC extraction via librosa (returns shape (n_mfcc, n_frames))."""
    return librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)


# Equivalent extraction with python_speech_features
# (returns shape (n_frames, numcep)).
from python_speech_features import mfcc

mfcc_features = mfcc(
    recording.flatten(),
    samplerate=16000,
    winlen=0.025,
    winstep=0.01,
    numcep=13,
    nfilt=26,
    nfft=512,
    lowfreq=0,
    highfreq=None,
    preemph=0.97,
    ceplifter=22,
    appendEnergy=True,
)
def apply_cmvn(features):
    """Cepstral mean-and-variance normalization along the time (frame) axis."""
    mu = np.mean(features, axis=0)
    sigma = np.std(features, axis=0)
    # Small epsilon keeps the division finite for constant coefficients.
    return (features - mu) / (sigma + 1e-6)


normalized_mfcc = apply_cmvn(mfcc_features)
def calculate_delta(features, delta_order=1):
    """Time-difference ("delta") features of a (frames, coeffs) matrix.

    delta_order == 1 computes central differences (f[i+1] - f[i-1]) / 2,
    with one-sided differences at the two ends; higher orders recurse.
    Inputs with fewer than two frames now return zeros instead of raising
    IndexError (the original indexed features[1] unconditionally).
    """
    if delta_order == 1:
        delta = np.zeros_like(features)
        if features.shape[0] < 2:
            return delta  # no neighbouring frame to difference against
        for i in range(1, features.shape[0] - 1):
            delta[i] = features[i + 1] - features[i - 1]
        delta[0] = features[1] - features[0]
        delta[-1] = features[-1] - features[-2]
        return delta / 2.0
    # Order k = first-order delta applied k times.
    return calculate_delta(calculate_delta(features), delta_order - 1)


delta_mfcc = calculate_delta(mfcc_features)
delta_delta_mfcc = calculate_delta(mfcc_features, 2)
# Concatenate static, first-order, and second-order delta features
# (13 + 13 + 13 = 39 dimensions per frame with the settings above).
combined_features = np.hstack([mfcc_features, delta_mfcc, delta_delta_mfcc])
speech_recognition/
├── audio_processor.py    # 音频采集与预处理
├── feature_extractor.py  # 特征提取核心
├── utils.py              # 辅助工具函数
└── main.py               # 主程序入口
# main.py
import numpy as np
import sounddevice as sd

from feature_extractor import MFCCExtractor


class SpeechRecognizer:
    """Records microphone audio and turns it into MFCC feature matrices."""

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate
        self.extractor = MFCCExtractor()

    def record_audio(self, duration=5):
        """Capture `duration` seconds of mono float32 audio; return it as a 1-D array."""
        print("开始录音...")
        recording = sd.rec(
            int(duration * self.sample_rate),
            samplerate=self.sample_rate,
            channels=1,
            dtype='float32',
        )
        sd.wait()
        print("录音结束")
        return recording.flatten()

    def extract_features(self, audio_data):
        """Delegate feature extraction to the configured MFCCExtractor."""
        return self.extractor.extract(audio_data)


if __name__ == "__main__":
    recognizer = SpeechRecognizer()
    audio = recognizer.record_audio()
    features = recognizer.extract_features(audio)
    print(f"提取的特征维度: {features.shape}")
# feature_extractor.py
import numpy as np
from python_speech_features import mfcc, delta


class MFCCExtractor:
    """Produces static + delta + delta-delta MFCC features for a 1-D signal."""

    def __init__(self, sample_rate=16000, num_cep=13):
        self.sample_rate = sample_rate
        self.num_cep = num_cep

    def extract(self, audio_data):
        """Return an (n_frames, 3 * num_cep) feature matrix."""
        # Static MFCCs (energy coefficient not appended here).
        static = mfcc(
            audio_data,
            samplerate=self.sample_rate,
            winlen=0.025,
            winstep=0.01,
            numcep=self.num_cep,
            nfilt=26,
            nfft=512,
            preemph=0.97,
            ceplifter=22,
            appendEnergy=False,
        )
        # First- and second-order dynamics.
        first = delta(static, 1)
        second = delta(static, 2)
        return np.hstack([static, first, second])
本系列文章后续将深入探讨这些高级主题,帮助读者构建更强大的语音识别系统。通过掌握特征提取这一核心技术,开发者不仅能够理解语音识别的底层原理,更能为后续的声学模型训练奠定坚实基础。