简介:本文详细介绍如何使用Python构建语音情感识别系统,涵盖声学特征提取、模型训练与部署全流程,提供完整代码示例和实用建议。
语音情感识别(Speech Emotion Recognition, SER)作为人机交互领域的关键技术,通过分析语音信号中的声学特征(如音调、语速、能量等)判断说话者的情感状态。该技术在智能客服、心理健康监测、教育评估等领域具有广泛应用前景。Python凭借其丰富的音频处理库和机器学习框架,成为开发SER系统的首选语言。
# 环境依赖安装命令!pip install librosa scikit-learn tensorflow soundfile
推荐使用Anaconda创建虚拟环境,确保库版本兼容性。关键库版本要求:
import librosaimport numpy as npdef preprocess_audio(file_path, sr=16000, frame_length=0.025, hop_length=0.01):"""音频预处理函数:param file_path: 音频文件路径:param sr: 采样率:param frame_length: 帧长(秒):param hop_length: 帧移(秒):return: 分帧后的音频信号"""# 加载音频文件y, sr = librosa.load(file_path, sr=sr)# 降噪处理(示例:简单阈值降噪)y = np.where(np.abs(y) > 0.01, y, 0)# 分帧参数计算frame_size = int(frame_length * sr)hop_size = int(hop_length * sr)# 分帧处理frames = librosa.util.frame(y, frame_length=frame_size, hop_length=hop_size)return frames, sr
def extract_features(y, sr):"""多特征提取函数:param y: 音频信号:param sr: 采样率:return: 特征向量"""features = {}# 梅尔频率倒谱系数(MFCC)mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)features['mfcc_mean'] = np.mean(mfcc, axis=1)features['mfcc_std'] = np.std(mfcc, axis=1)# 基频特征f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'),fmax=librosa.note_to_hz('C7'))features['f0_mean'] = np.mean(f0[voiced_flag])features['f0_std'] = np.std(f0[voiced_flag])# 能量特征rms = librosa.feature.rms(y=y)features['energy_mean'] = np.mean(rms)features['energy_std'] = np.std(rms)# 过零率zcr = librosa.feature.zero_crossing_rate(y)features['zcr_mean'] = np.mean(zcr)return features
from sklearn.svm import SVCfrom sklearn.model_selection import train_test_splitfrom sklearn.preprocessing import StandardScaler# 假设已有特征矩阵X和标签yX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)# 特征标准化scaler = StandardScaler()X_train_scaled = scaler.fit_transform(X_train)X_test_scaled = scaler.transform(X_test)# SVM模型训练svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')svm_model.fit(X_train_scaled, y_train)# 评估print(f"SVM Accuracy: {svm_model.score(X_test_scaled, y_test):.2f}")
import tensorflow as tffrom tensorflow.keras import layers, modelsdef build_lstm_model(input_shape, num_classes):"""构建LSTM情感识别模型:param input_shape: 输入特征形状:param num_classes: 情感类别数:return: 编译好的Keras模型"""model = models.Sequential([layers.LSTM(64, return_sequences=True, input_shape=input_shape),layers.Dropout(0.3),layers.LSTM(32),layers.Dense(32, activation='relu'),layers.Dense(num_classes, activation='softmax')])model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])return model# 示例使用input_shape = (100, 13) # 假设100帧,每帧13维MFCCmodel = build_lstm_model(input_shape, 5) # 5种情感类别model.summary()
import randomdef augment_audio(y, sr):"""音频数据增强:param y: 原始音频:param sr: 采样率:return: 增强后的音频"""# 随机时间拉伸rate = random.uniform(0.8, 1.2)y_stretched = librosa.effects.time_stretch(y, rate)# 随机音高偏移n_steps = random.randint(-3, 3)y_shifted = librosa.effects.pitch_shift(y_stretched, sr, n_steps=n_steps)# 随机添加噪声noise_amp = 0.005 * random.random() * np.max(y_shifted)y_noisy = y_shifted + noise_amp * np.random.normal(size=y_shifted.shape)return y_noisy
early_stopping = EarlyStopping(monitor=’val_loss’, patience=10)
model.fit(X_train, y_train,
validation_split=0.2,
epochs=100,
callbacks=[early_stopping])
## 五、部署与应用建议### 1. 模型导出与部署```python# 导出为SavedModel格式model.save('emotion_recognition_model')# 或导出为TensorFlow Lite格式(移动端部署)converter = tf.lite.TFLiteConverter.from_keras_model(model)tflite_model = converter.convert()with open('emotion_model.tflite', 'wb') as f:f.write(tflite_model)
import sounddevice as sddef realtime_recognition(model, scaler):"""实时语音情感识别:param model: 训练好的模型:param scaler: 特征标准化器"""def callback(indata, frames, time, status):if status:print(status)# 实时特征提取y = indata.flatten()features = extract_features(y, sr=16000)# 特征向量化(需适配模型输入)# ...# 预测# emotion = model.predict(...)# print(f"Detected emotion: {emotion}")with sd.InputStream(samplerate=16000, channels=1, callback=callback):print("Start speaking... (Ctrl+C to stop)")while True:pass
该项目完整实现约需300-500行代码,建议采用模块化设计便于维护。实际开发中需特别注意音频数据的采样率一致性处理,这是导致模型性能下降的常见原因。对于商业应用,建议考虑使用ONNX Runtime等优化推理引擎提升性能。