简介:本文详细介绍如何使用Python实现录音与语音降噪功能,涵盖录音库选择、音频处理原理、降噪算法实现及代码示例,帮助开发者快速构建语音处理系统。
在语音交互、会议记录、智能客服等场景中,语音质量直接影响用户体验。环境噪声(如风扇声、键盘敲击声)会显著降低语音识别准确率,因此语音降噪成为关键技术环节。Python凭借其丰富的音频处理库(如sounddevice、librosa、noisereduce)和机器学习框架(如TensorFlow、PyTorch),成为语音降噪开发的理想选择。
sounddevice是跨平台的音频I/O库,支持实时录音和播放:
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write

# Recording parameters.
fs = 44100  # sample rate in Hz
duration = 5  # recording length in seconds
filename = "output.wav"


def callback(indata, frames, time, status):
    """Stream callback hook that prints any status flags it receives.

    This function is not passed to the blocking ``sd.rec`` call below; it is
    kept as the place where real-time processing logic can be added when a
    callback-driven stream is used instead.
    """
    if status:
        print(status)
    # Real-time processing logic can be added here.


# Blocking (synchronous) recording.
print("开始录音...")
recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
sd.wait()  # wait until the recording has finished
sd.play(recording, fs)  # play the recording back for verification
sd.wait()

# Save as a 16-bit PCM WAV file. Samples are clipped to [-1, 1] first so that
# any out-of-range float cannot overflow when it is cast to int16.
write(filename, fs, (np.clip(recording, -1.0, 1.0) * 32767).astype(np.int16))
import librosa
import numpy as np
from scipy.io import wavfile


def spectral_subtraction(audio_path, noise_path, output_path, n_fft=1024):
    """Denoise a recording by magnitude spectral subtraction.

    The average magnitude spectrum of the first 0.5 seconds of the noise
    recording is subtracted from every frame of the speech spectrogram
    (floored at zero); the original phase is reused for reconstruction.

    Args:
        audio_path: Path of the noisy speech file to load.
        noise_path: Path of the noise-only file; it is loaded at the speech
            file's sample rate.
        output_path: Path of the WAV file to write (32-bit float samples).
        n_fft: FFT size used for the STFT.
    """
    # Load the speech and the noise reference at the same sample rate.
    y, sr = librosa.load(audio_path, sr=None)
    noise, _ = librosa.load(noise_path, sr=sr)

    # Same hop as librosa's default (n_fft // 4), made explicit because it is
    # needed below to convert seconds into STFT frames.
    hop_length = n_fft // 4
    Y = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    Noise = librosa.stft(noise, n_fft=n_fft, hop_length=hop_length)

    # Noise estimate from the first 0.5 s. STFT columns are frames, not
    # samples, so the duration must be converted to a frame count.
    noise_frames = max(1, int(0.5 * sr / hop_length))
    noise_mag = np.mean(np.abs(Noise[:, :noise_frames]), axis=1)

    # Spectral subtraction on the magnitude, keeping the noisy phase.
    Y_mag = np.abs(Y)
    Y_phase = np.angle(Y)
    Y_mag_clean = np.maximum(Y_mag - noise_mag[:, np.newaxis], 0)

    # Rebuild the waveform with the same number of samples as the input.
    Y_clean = Y_mag_clean * np.exp(1j * Y_phase)
    y_clean = librosa.istft(Y_clean, hop_length=hop_length, length=len(y))

    # librosa.output.write_wav no longer exists in current librosa releases,
    # so the result is written with scipy instead.
    wavfile.write(output_path, sr, y_clean.astype(np.float32))
def wiener_filter(audio_path, noise_path, output_path, n_fft=1024, alpha=0.95):
    """Denoise a recording with a Wiener-style spectral gain.

    A per-bin gain ``H = max((P_y - alpha * P_n) / P_y, 0)`` is computed from
    the noisy power spectrogram ``P_y`` and the time-averaged noise power
    ``P_n``; ``sqrt(H)`` is then applied to the noisy STFT.

    Args:
        audio_path: Path of the noisy speech file to load.
        noise_path: Path of the noise-only file; it is loaded at the speech
            file's sample rate.
        output_path: Path of the WAV file to write (32-bit float samples).
        n_fft: FFT size used for the STFT.
        alpha: Scale applied to the noise power before subtraction
            (the over-subtraction factor).
    """
    from scipy.io import wavfile

    y, sr = librosa.load(audio_path, sr=None)
    noise, _ = librosa.load(noise_path, sr=sr)

    # Power spectra of the noisy signal and of the noise reference.
    Y = librosa.stft(y, n_fft=n_fft)
    Noise = librosa.stft(noise, n_fft=n_fft)
    P_y = np.abs(Y) ** 2
    P_n = np.mean(np.abs(Noise) ** 2, axis=1)

    # Gain computation. The denominator is floored so that bins with zero
    # energy give a gain of 0 instead of NaN from a 0/0 division.
    floor = np.finfo(np.float32).tiny
    H = (P_y - alpha * P_n[:, np.newaxis]) / np.maximum(P_y, floor)
    H = np.maximum(H, 0)  # negative gains are not meaningful

    # Apply the filter and rebuild a waveform as long as the input.
    Y_clean = Y * np.sqrt(H)
    y_clean = librosa.istft(Y_clean, length=len(y))

    # librosa.output.write_wav no longer exists in current librosa releases,
    # so the result is written with scipy instead.
    wavfile.write(output_path, sr, y_clean.astype(np.float32))
import librosa
import noisereduce as nr
import numpy as np
from scipy.io import wavfile

# Load the audio at its native sample rate.
data, rate = librosa.load("noisy_speech.wav", sr=None)

# Take the first 0.5 seconds as the noise-only segment.
noise_sample = data[:int(0.5 * rate)]

# Run the noise reduction.
# NOTE(review): in recent noisereduce releases `y_noise` appears to be used
# only when stationary=True; confirm against the installed version whether
# the noise clip has any effect with stationary=False.
reduced_noise = nr.reduce_noise(
    y=data,
    sr=rate,
    y_noise=noise_sample,
    stationary=False,  # non-stationary noise
)

# Save the result. librosa.output.write_wav no longer exists in current
# librosa releases, so the file is written with scipy instead.
wavfile.write("cleaned.wav", rate, np.asarray(reduced_noise, dtype=np.float32))
# Requires rnnoise-python to be installed first: pip install rnnoise
import rnnoise

# Create the denoiser.
denoiser = rnnoise.Denoiser()

# Read the input file as raw bytes.
with open("noisy.wav", "rb") as f:
    wav_data = f.read()

# Frame-by-frame processing (suited to real-time use).
# NOTE(review): `wav_data` is the whole file, so it includes the WAV header,
# and the bytes written below carry no header of their own. Confirm what
# `process_frames` expects and yields for the installed rnnoise binding.
clean_frames = list(denoiser.process_frames(wav_data))

# Merge the processed frames and save them.
clean_audio = b"".join(clean_frames)
with open("cleaned_rnnoise.wav", "wb") as f:
    f.write(clean_audio)
def calculate_snr(clean, noisy):
    """Return the signal-to-noise ratio of ``noisy`` relative to ``clean`` in dB.

    The noise is taken to be ``noisy - clean``; the result is
    ``10 * log10(mean(clean**2) / mean((noisy - clean)**2))``.

    Args:
        clean: Reference signal (array-like of numbers).
        noisy: Degraded signal, broadcast-compatible with ``clean``.

    Returns:
        The SNR in decibels. ``inf`` when the noise power is zero, ``-inf``
        when only the signal power is zero, and ``nan`` when both are zero.
    """
    # Work in float64: squaring integer PCM (e.g. int16) in its own dtype
    # overflows, and plain Python lists do not support ``**`` at all.
    clean_arr = np.asarray(clean, dtype=np.float64)
    noisy_arr = np.asarray(noisy, dtype=np.float64)

    signal_power = np.mean(clean_arr ** 2)
    noise_power = np.mean((noisy_arr - clean_arr) ** 2)

    # The zero-power cases are valid results (see Returns), so the
    # divide-by-zero / invalid-value warnings are silenced here.
    with np.errstate(divide="ignore", invalid="ignore"):
        return 10 * np.log10(signal_power / noise_power)
除信噪比(SNR)外,还可以使用pesq库计算PESQ(语音质量感知评估)分数,作为降噪效果的客观评价指标。
def process_frame(frame, model, pad=512):
    """Process one audio frame in the frequency domain.

    The frame is zero-padded on both sides, transformed with a real FFT,
    passed through ``model``, transformed back, and trimmed to the original
    frame length.

    Args:
        frame: One-dimensional array of time-domain samples.
        model: Callable that maps the complex spectrum to a cleaned spectrum
            of the same shape, or ``None`` to leave the spectrum unchanged.
            This callable contract is an assumption of this implementation;
            adapt the call below if the model exposes a different interface.
        pad: Number of zero samples added on each side before the FFT.

    Returns:
        The processed frame, with the same length as ``frame``.
    """
    # Zero padding on both sides.
    padded = np.pad(frame, (pad, pad), mode='constant')

    # Forward FFT.
    spectrum = np.fft.rfft(padded)

    # Denoising step.
    if model is not None:
        spectrum = model(spectrum)

    # Back to the time domain; `n` is passed explicitly so that odd padded
    # lengths are reconstructed correctly. Then drop the padding.
    restored = np.fft.irfft(spectrum, n=len(padded))
    cleaned_frame = restored[pad:len(padded) - pad]
    return cleaned_frame
在实时处理场景中,可使用queue.Queue实现线程间通信:录音回调只负责把音频帧放入队列,降噪处理在独立的工作线程中进行,避免阻塞音频回调。
import time

import librosa
import noisereduce as nr
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write


class VoiceDenoiser:
    """Record from the default input device and save a denoised WAV file."""

    def __init__(self, sr=16000, frame_size=1024):
        """Store the stream settings and allocate the rolling buffer.

        Args:
            sr: Sample rate in Hz used for recording and for the output file.
            frame_size: Block size requested from the input stream; also the
                length of ``self.buffer``.
        """
        self.sr = sr
        self.frame_size = frame_size
        # Rolling window holding the most recent `frame_size` samples.
        self.buffer = np.zeros(frame_size)

    def record_and_denoise(self, duration=5, output_file="denoised.wav"):
        """Record for ``duration`` seconds, denoise, and write ``output_file``.

        The audio callback only copies incoming blocks; noise reduction runs
        once on the whole recording after the stream has stopped, and the
        result is saved as 16-bit PCM.

        Args:
            duration: Recording length in seconds.
            output_file: Path of the WAV file to write.
        """
        captured = []

        def callback(indata, frames, time, status):
            if status:
                print(status)
            # Copy the block: `indata` is only valid during the callback.
            block = indata[:, 0].copy()
            captured.append(block)
            # Keep `self.buffer` as a rolling window of the latest samples.
            n = min(frames, self.frame_size)
            if n:
                self.buffer = np.roll(self.buffer, -n)
                self.buffer[-n:] = block[-n:]

        # Start recording. The context manager starts the stream on entry and
        # stops and closes it on exit, even if an exception is raised.
        print("开始录音...")
        with sd.InputStream(
            samplerate=self.sr,
            channels=1,
            callback=callback,
            blocksize=self.frame_size,
        ):
            # Simulated recording window (a real application would let the
            # user control when to stop).
            time.sleep(duration)

        if captured:
            recording = np.concatenate(captured)
            # Simplified noise estimate: the first 0.5 s is assumed to contain
            # noise only. A real application should use a better estimator.
            noise_len = max(1, min(len(recording), int(0.5 * self.sr)))
            cleaned = nr.reduce_noise(
                y=recording,
                sr=self.sr,
                y_noise=recording[:noise_len],
            )
        else:
            # Nothing was delivered by the device; save the (silent) buffer.
            cleaned = self.buffer

        # Clip before the int16 cast so out-of-range samples cannot overflow.
        pcm = (np.clip(cleaned, -1.0, 1.0) * 32767).astype(np.int16)
        write(output_file, self.sr, pcm)
        print(f"处理完成,结果已保存至 {output_file}")


# Usage example
denoiser = VoiceDenoiser(sr=16000)
denoiser.record_and_denoise(duration=10)
本文提供的方案覆盖了从基础录音到高级降噪的全流程,开发者可根据实际需求选择适合的方法。对于实时性要求高的场景,建议优先测试RNNoise或轻量级深度学习模型;对于离线处理,频谱减法结合深度学习后处理可获得更好效果。