asr/transcribe_audio.py
2025-03-09 14:58:32 +08:00

62 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import whisper
from tqdm import tqdm
import torch
import re
import opencc
# File extensions (lower-case, with leading dot) treated as audio input.
_AUDIO_EXTENSIONS = ('.mp3', '.wav', '.m4a', '.flac')

# One or more sentence-ending marks: the CJK full stop, ASCII "!" / "?", and
# their full-width forms (U+FF01, U+FF1F), written as escapes so the pattern
# stays unambiguous no matter how the file is displayed or normalized.
_SENTENCE_END = re.compile(r'([。!?\uff01\uff1f]+)')


def _split_sentences(text):
    """Return the sentences of *text*, each keeping its closing punctuation."""
    # Insert a line break after every run of end marks instead of splitting
    # on them, so the punctuation stays attached to its sentence.
    marked = _SENTENCE_END.sub(r'\1\n', text)
    return [line.strip() for line in marked.split('\n') if line.strip()]


def transcribe_audio_folder(input_dir, output_dir):
    """Transcribe every audio file in *input_dir* to a text file in *output_dir*.

    Each file whose extension is one of ``.mp3``, ``.wav``, ``.m4a`` or
    ``.flac`` (case-insensitive) is transcribed as Chinese with the Whisper
    "base" model, converted from Traditional to Simplified Chinese with
    OpenCC, broken into one sentence per line, and written as UTF-8 to
    ``<output_dir>/<original name without extension>.txt``.

    Args:
        input_dir: Directory that is scanned (non-recursively) for audio.
        output_dir: Directory for the transcripts; created if missing.

    Returns:
        None. Results are written to disk.

    Raises:
        OSError: If *input_dir* cannot be listed or *output_dir* cannot be
            created. Failures on individual files are reported on stdout
            and do not stop the batch.
    """
    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Collect the audio files up front, in a deterministic order.
    audio_files = sorted(
        name for name in os.listdir(input_dir)
        if os.path.splitext(name)[1].lower() in _AUDIO_EXTENSIONS
    )
    if not audio_files:
        # Nothing to transcribe: skip the expensive model load entirely.
        return

    # Use the GPU when one is available, otherwise fall back to the CPU
    # instead of failing on machines without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=device)

    # OpenCC converter: Traditional Chinese -> Simplified Chinese.
    converter = opencc.OpenCC('t2s.json')

    # Transcribe the files one by one with a progress bar.
    for filename in tqdm(audio_files):
        file_path = os.path.join(input_dir, filename)
        try:
            result = model.transcribe(
                file_path,
                language="zh",
                task="transcribe",
                # Half precision is only requested when running on the GPU.
                fp16=(device == "cuda"),
                verbose=False,
            )
            simplified_text = converter.convert(result["text"])
            formatted_text = '\n'.join(_split_sentences(simplified_text))

            # The transcript is named after the audio file.
            base_name = os.path.splitext(filename)[0]
            output_path = os.path.join(output_dir, f"{base_name}.txt")
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(formatted_text)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            print(f"处理文件 {filename} 时出错: {e}")
if __name__ == "__main__":
    # Example invocation: transcribe the sample data folder into the
    # sibling output folder.
    example_paths = {
        "input_dir": "/home/mei/work/asr/data",
        "output_dir": "/home/mei/work/asr/out",
    }
    transcribe_audio_folder(**example_paths)