import os
import re

import opencc
import torch
import whisper
from tqdm import tqdm


def transcribe_audio_folder(input_dir, output_dir):
    # Load the multilingual base model, using the GPU when one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=device)

    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Initialize the OpenCC converter (Traditional -> Simplified Chinese)
    converter = opencc.OpenCC('t2s.json')

    # Collect all audio files in the input directory
    audio_exts = ['.mp3', '.wav', '.m4a', '.flac']
    audio_files = [
        f for f in os.listdir(input_dir)
        if os.path.splitext(f)[1].lower() in audio_exts
    ]

    # Transcribe the files in a batch
    for filename in tqdm(audio_files):
        file_path = os.path.join(input_dir, filename)
        try:
            # Use fp16 inference only when running on the GPU
            result = model.transcribe(
                file_path,
                language="zh",
                task="transcribe",
                fp16=(device == "cuda"),
                verbose=False
            )

            # Convert Traditional Chinese to Simplified Chinese
            simplified_text = converter.convert(result["text"])

            # Insert a line break after each sentence-ending mark
            # (full-width and ASCII variants), one sentence per line
            formatted_text = re.sub(
                r'([。！？!?])', r'\1\n', simplified_text
            ).strip()

            # Build the output file name from the audio file name
            base_name = os.path.splitext(filename)[0]
            output_path = os.path.join(output_dir, f"{base_name}.txt")

            # Save the Simplified Chinese transcript
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(formatted_text)

        except Exception as e:
            print(f"Error while processing {filename}: {e}")


if __name__ == "__main__":
    # Usage example
    transcribe_audio_folder(
        input_dir="/home/mei/work/asr/data",
        output_dir="/home/mei/work/asr/out"
    )