简介:本文详解如何使用Python将SRT字幕文件转换为语音,涵盖主流文字转语音模块对比、SRT文件解析方法及完整代码实现,提供从环境配置到优化部署的全流程指导。
在视频制作、无障碍访问和语言学习场景中,将字幕文件转换为语音具有重要实用价值。SRT(SubRip Subtitle)作为最常用的字幕格式,其时间轴信息可精确控制语音合成时机。Python凭借丰富的文本处理库和语音合成模块,成为实现该功能的理想选择。
import pyttsx3

# Create the speech engine and configure it before queuing any text.
engine = pyttsx3.init()
engine.setProperty('rate', 150)    # speaking rate
engine.setProperty('volume', 0.9)  # volume

# Queue the sample sentence, then run the engine so it is spoken.
engine.say("这是测试语音")
engine.runAndWait()
特点:
import asyncio

from edge_tts import Communicate


async def text_to_speech():
    """Synthesize a fixed sample sentence and save it as ``output.mp3``."""
    synthesizer = Communicate(text="你好世界", voice="zh-CN-YunxiNeural")
    await synthesizer.save("output.mp3")


# Drive the coroutine to completion from synchronous code.
asyncio.run(text_to_speech())
特点:
# Alibaba Cloud example (requires an AccessKey to be configured)
from aliyunsdkcore.client import AcsClient
from aliyunsdknls_cloud_meta.request.v20190228 import SubmitTtsTaskRequest

# Replace the placeholders with real credentials.
# NOTE(review): the third argument is presumably the region id — confirm
# against the SDK documentation.
client = AcsClient('<access_key_id>', '<access_secret>', 'default')

# Build the synthesis request: the text to speak and the voice to use.
request = SubmitTtsTaskRequest.SubmitTtsTaskRequest()
request.set_Text("商业级语音合成")
request.set_Voice("xiaoyun")

# Submit the request; as the method name says, failures surface as exceptions.
response = client.do_action_with_exception(request)
特点:
def parse_srt(file_path):
    """Parse an SRT subtitle file into a list of cue dictionaries.

    Cues are separated by blank lines. Each returned dict may contain:

    * ``id``    -- the cue index (int),
    * ``start`` -- start time in seconds (float, see ``parse_time``),
    * ``end``   -- end time in seconds (float),
    * ``text``  -- the subtitle text, multi-line cues joined with newlines.

    A key is only present when the corresponding line exists in the file,
    so callers should not assume every cue has text.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file (with or without
            a byte-order mark).

    Returns:
        A list of cue dicts in file order.

    Raises:
        ValueError: If a timing line contains a malformed timestamp.
        OSError: If the file cannot be opened.
    """
    entries = []
    current_entry = {}
    # 'utf-8-sig' strips a leading BOM if present; without this the first
    # index line reads as '\ufeff1' and is not recognised as a number.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line terminates the cue collected so far.
                if current_entry:
                    entries.append(current_entry)
                    current_entry = {}
                continue
            if not current_entry and line.isdigit():
                # Only the first line of a cue is its index; an all-digit
                # line later in the cue is subtitle text (e.g. "2024").
                current_entry['id'] = int(line)
            elif '-->' in line and 'start' not in current_entry:
                start, _, rest = line.partition('-->')
                # Anything after the end timestamp (e.g. "X1:.. Y1:.."
                # position coordinates) is ignored.
                fields = rest.split()
                end = fields[0] if fields else ''
                current_entry['start'] = parse_time(start)
                current_entry['end'] = parse_time(end)
            elif 'text' not in current_entry:
                current_entry['text'] = line
            else:
                current_entry['text'] += '\n' + line
    # Files often lack a trailing blank line; keep the last cue anyway.
    if current_entry:
        entries.append(current_entry)
    return entries


def parse_time(time_str):
    """Convert an SRT timestamp ``HH:MM:SS,mmm`` to seconds.

    Surrounding whitespace is ignored, and ``.`` is accepted as the
    millisecond separator in addition to the standard ``,``.

    Args:
        time_str: Timestamp such as ``"00:01:02,500"``.

    Returns:
        The time offset in seconds as a float.

    Raises:
        ValueError: If the string does not have the expected shape.
    """
    hh, mm, ss_ms = time_str.strip().split(':')
    ss, ms = ss_ms.replace('.', ',').split(',')
    return float(hh)*3600 + float(mm)*60 + float(ss) + float(ms)/1000
说明:SRT 时间轴的时间格式为 HH:MM:SS,mmm(时:分:秒,毫秒),parse_time 将其换算为以秒为单位的浮点数。
import asyncio
import os

from edge_tts import Communicate
from pydub import AudioSegment
from pydub.playback import play


async def process_srt(srt_path, output_dir="audio_segments",
                      output_path="final_output.mp3",
                      voice="zh-CN-YunxiNeural", max_concurrency=5):
    """Synthesize every cue of an SRT file and merge them into one MP3.

    Each cue is synthesized to ``<output_dir>/<id>.mp3``. The clips are
    then concatenated in start-time order, with silence inserted so that
    each clip begins at its cue's start time whenever the preceding audio
    has already finished by then.

    Args:
        srt_path: Path of the SRT file, parsed with ``parse_srt`` (defined
            elsewhere in this file).
        output_dir: Directory for the per-cue MP3 files; created if missing.
        output_path: Path of the merged MP3 file.
        voice: Voice name passed to ``Communicate``.
        max_concurrency: Maximum number of synthesis requests in flight at
            the same time.

    Returns:
        ``output_path``, the path of the merged audio file.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    os.makedirs(output_dir, exist_ok=True)

    # Cues without text, timing or an index cannot be synthesized or placed
    # on the timeline; skip them instead of failing with KeyError.
    entries = [
        entry for entry in parse_srt(srt_path)
        if entry.get('text', '').strip() and 'start' in entry and 'id' in entry
    ]

    # Bound the number of simultaneous requests to the speech service.
    limiter = asyncio.Semaphore(max_concurrency)

    async def synthesize(entry):
        audio_path = os.path.join(output_dir, f"{entry['id']}.mp3")
        communicate = Communicate(text=entry['text'], voice=voice, rate="+0%")
        async with limiter:
            await communicate.save(audio_path)
        return entry['start'], audio_path

    clips = await asyncio.gather(*(synthesize(entry) for entry in entries))

    # Merge the clips in chronological order.
    full_audio = AudioSegment.silent(duration=0)
    for start_time, path in sorted(clips, key=lambda clip: clip[0]):
        segment = AudioSegment.from_mp3(path)
        # len(AudioSegment) is in milliseconds. Work in whole milliseconds
        # and only pad when the previous audio ends before this cue starts.
        gap_ms = int(round(start_time * 1000)) - len(full_audio)
        if gap_ms > 0:
            full_audio += AudioSegment.silent(duration=gap_ms)
        full_audio += segment

    full_audio.export(output_path, format="mp3")
    return output_path
性能优化建议:使用 asyncio.Semaphore 限制并发请求数,避免同时发起过多语音合成请求。
# Base image: slim Python 3.9
FROM python:3.9-slim

# All following paths are relative to /app
WORKDIR /app

# Copy the dependency manifest on its own, then install from it
# without keeping pip's download cache in the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the build context
COPY . .

# Default command: run the application entry script
CMD ["python", "main.py"]
常见问题与建议:若 SRT 文件带有 BOM 头导致解析异常,打开文件时请使用 encoding='utf-8-sig' 参数;合成时设置 rate="+0%" 保持语速稳定;建议使用 python -m venv venv 创建虚拟环境以隔离依赖。
模块选择原则:
开发流程建议:
测试要点:
通过上述方法,开发者可以构建出稳定、高效的SRT转语音系统,满足从个人学习到商业应用的多样化需求。实际开发中建议结合具体场景选择技术栈,并注意处理异常情况和性能优化。