语音对话COOKBOOK
更新时间:2025-01-17
目标
实现一个语音对话功能,支持多种语音音色。用户可以参考cookbook代码,通过AppBuilder-SDK将语音功能很好地融入自己的平台、应用中。
实现原理
通过循环不断处理用户的语音,将语音转文本,然后进行对话,最后将对话结果通过TTS进行播报。
- 使用大模型的 ASR 进行语音转文本。
- 使用用户自己创建的Agent进行对话,适配用户的应用场景,并具有上下文理解能力。
- 使用大模型的 TTS 进行文本转语音并进行播报。
前置条件
- 使用内置ASR、TTS组件之前,请先开通组件服务 ( 短语音识别-极速版 、 短文本在线合成 ) 并购买额度,可参考开通组件服务。
- pip安装pyaudio、webrtcvad依赖包
- 给程序开放麦克风权限
- 创建好自己的Agent应用
示例代码
# Copyright (c) 2024 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import wave
import sys
import pyaudio
import webrtcvad
import appbuilder
import re
# Create an API key on the Qianfan AppBuilder console first; the procedure is described at
# https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5
# Set the environment variable that carries the key (replace the placeholder).
os.environ["APPBUILDER_TOKEN"] = (
    "..."
)
# ID of an already-published AppBuilder application (replace the placeholder).
app_id = "..."
appbuilder.logger.setLoglevel("ERROR")

# Audio capture parameters shared by recording and voice-activity detection.
FORMAT = pyaudio.paInt16
# NOTE(review): py-webrtcvad documents 16-bit *mono* input only; on non-macOS
# platforms this selects 2 channels — confirm that VAD/ASR accept it.
CHANNELS = 1 if sys.platform == "darwin" else 2
RATE = 16000  # sample rate in Hz
DURATION = 30  # ms of audio per captured slice
# Number of frames read per slice: 16 frames/ms * 30 ms = 480.
# (The earlier `CHUNK = 1024` assignment was dead code: it was overwritten
# here before ever being read.)
CHUNK = RATE // 1000 * DURATION
class Chatbot:
    """Voice chat loop: record speech -> ASR -> Agent -> TTS playback.

    Each step lives in its own method (`record_audio`, `run_asr`,
    `run_agent`, `run_tts_and_play_audio`) so it can be swapped for another
    implementation without touching the others.
    """

    # Matches http/https links up to the next whitespace character.
    _URL_PATTERN = re.compile(r"(https?://[^\s]+)")
    # Number of 30 ms slices treated as one second by the recording window
    # logic (33 * 30 ms ~= 1 s).
    _SLICES_PER_SECOND = 33

    def __init__(self):
        """Create the audio backend, the TTS/ASR components and the agent session."""
        self.p = pyaudio.PyAudio()
        self.tts = appbuilder.TTS()
        self.asr = appbuilder.ASR()
        self.agent = appbuilder.AppBuilderClient(app_id)
        self.conversation_id = self.agent.create_conversation()

    def run(self) -> None:
        """Greet the user, then loop forever over record -> ASR -> agent -> TTS."""
        self.run_tts_and_play_audio(
            "我是你的专属聊天机器人,如果你有什么问题,可以直接问我"
        )
        audio_path = "output.wav"
        while True:
            # Record
            print("开始记录音频...")
            if self.record_audio(audio_path) < 1000:
                # Less than one second of detected speech: pause briefly and
                # listen again instead of sending near-silence to ASR.
                time.sleep(1)
                continue
            print("音频记录结束")

            # ASR
            print("开始执行ASR...")
            query = self.run_asr(audio_path)
            print("结束执行ASR")

            # Agent
            print("query: ", query)
            if not query:
                continue
            answer, links = self._extract_links(self.run_agent(query))
            for link in links:
                print("链接地址:", link)
            print("answer:", answer)

            # TTS
            print("开始执行TTS并播报...")
            self.run_tts_and_play_audio(answer)
            print("结束TTS并播报结束")

    @staticmethod
    def _extract_links(answer: str):
        """Split *answer* into text without links and the list of links found.

        Links are printed rather than spoken. Every match is removed in a
        single regex pass, so a link that is a prefix of a later link cannot
        leave a stray fragment of the longer one behind.

        Returns:
            A ``(text_without_links, links)`` tuple.
        """
        links = Chatbot._URL_PATTERN.findall(answer)
        return Chatbot._URL_PATTERN.sub("", answer), links

    def record_audio(self, path):
        """Record microphone audio into the WAV file *path* until speech ends.

        Only slices classified as speech by the VAD are written to the file.
        Recording is evaluated in windows: the first window lasts about five
        seconds, later ones shrink by half down to about one second. It stops
        when a window is more than 70% non-speech, or once more than about
        ten seconds of speech slices have been accumulated.

        Returns:
            Milliseconds of speech accumulated over the completed windows.
        """
        per_second = self._SLICES_PER_SECOND
        with wave.open(path, "wb") as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(self.p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            stream = self.p.open(
                format=FORMAT, channels=CHANNELS, rate=RATE, input=True
            )
            try:
                vad = webrtcvad.Vad(1)
                not_speech_times = 0
                speech_times = 0
                total_times = 0
                start_up_times = per_second * 5  # first window is ~5 seconds
                history_speech_times = 0
                while history_speech_times <= per_second * 10:
                    # Do not raise on input overflow; dropped frames are
                    # acceptable for this use.
                    data = stream.read(CHUNK, exception_on_overflow=False)
                    if vad.is_speech(data, RATE):
                        speech_times += 1
                        wf.writeframes(data)
                    else:
                        not_speech_times += 1
                    total_times += 1
                    if total_times < start_up_times:
                        continue
                    # Window complete: bank its speech count, then decide
                    # whether the speaker has stopped.
                    history_speech_times += speech_times
                    if float(not_speech_times) / float(total_times) > 0.7:
                        break
                    # Emulate a sliding window by restarting the counters
                    # with a shorter window (never below ~1 second).
                    not_speech_times = 0
                    speech_times = 0
                    total_times = 0
                    start_up_times = max(per_second, start_up_times / 2)
            finally:
                # Release the input device even if reading or the VAD raised.
                stream.close()
        return history_speech_times * DURATION

    def run_tts_and_play_audio(self, text: str):
        """Synthesize *text* with the streaming TTS component and play it."""
        # Documentation for AppBuilder's built-in TTS component (tune the parameters below as needed):
        # https://github.com/baidubce/app-builder/tree/master/python/core/components/tts
        msg = self.tts.run(
            appbuilder.Message(content={"text": text}),
            speed=5,
            pitch=5,
            volume=5,
            person=0,
            audio_type="pcm",
            model="paddlespeech-tts",
            stream=True,
        )
        # The output stream is opened as 16-bit mono at 24 kHz.
        # NOTE(review): presumably this matches the PCM format returned by
        # the TTS model — confirm against the component documentation.
        stream = self.p.open(
            format=self.p.get_format_from_width(2),
            channels=1,
            rate=24000,
            output=True,
            frames_per_buffer=2048,
        )
        try:
            for pcm in msg.content:
                stream.write(pcm)
            stream.stop_stream()
        finally:
            # Release the output device even if synthesis or playback raised.
            stream.close()

    # Documentation for AppBuilder's built-in ASR component (tune the parameters as needed):
    # https://github.com/baidubce/app-builder/blob/master/python/core/components/asr/README.md
    def run_asr(self, audio_path: str):
        """Transcribe the WAV file at *audio_path*.

        Returns:
            The first recognition result, or an empty string when the ASR
            component returns an empty result list (the caller treats an
            empty query as "nothing was said").
        """
        with open(audio_path, "rb") as f:
            content_data = {"audio_format": "wav", "raw_audio": f.read(), "rate": RATE}
        out = self.asr.run(appbuilder.Message(content_data))
        results = out.content["result"]
        return results[0] if results else ""

    def run_agent(self, query):
        """Send *query* to the agent within the current conversation.

        The streamed response pieces are concatenated into one answer string.
        """
        msg = self.agent.run(self.conversation_id, query, stream=True)
        return "".join(content.answer for content in msg.content)
if __name__ == "__main__":
    # Script entry point: build the chatbot and start its endless dialogue loop.
    Chatbot().run()
使用方法
直接运行程序即可。
也可以将下面的功能模块替换成自己的其他实现或模型:
- record_audio: 录音
- run_asr: 语音识别,AppBuilder ASR组件使用文档
- run_agent: Agent对话功能。
- run_tts_and_play_audio:回复的语音生成并播报。AppBuilder TTS组件使用文档
流式TTS已经上线,如需测试配额,可前往配额申请地址申请。