上传文件至 /

2025-03-09 14:58:32 +08:00 · 2025-03-09 14:58:32 +08:00 · 4d6c36f5a4
commit 4d6c36f5a4
parent 5f593fca2e
1 changed files with 62 additions and 0 deletions
--- a/transcribe_audio.py
+++ b/transcribe_audio.py
@ -0,0 +1,62 @@
+import os
+import whisper
+from tqdm import tqdm
+import torch
+import re
+import opencc
+
+def transcribe_audio_folder(input_dir, output_dir):
+    
+    # 加载多语言大模型（使用GPU加速）
+    model = whisper.load_model("base", device='cuda')
+    
+    # 确保输出目录存在
+    os.makedirs(output_dir, exist_ok=True)
+    
+    # 初始化 OpenCC 转换器
+    converter = opencc.OpenCC('t2s.json')
+    
+    # 遍历目录中的所有音频文件
+    audio_exts = ['.mp3', '.wav', '.m4a', '.flac']
+    audio_files = [f for f in os.listdir(input_dir) 
+                  if os.path.splitext(f)[1].lower() in audio_exts]
+    
+    # 批量转写
+    for filename in tqdm(audio_files):
+        file_path = os.path.join(input_dir, filename)
+        try:
+            # 使用GPU加速（fp16精度）
+            result = model.transcribe(
+                file_path,
+                language="zh",
+                task="transcribe",
+                fp16=('cuda' == "cuda"),
+                verbose=False
+            )
+            
+            # 将繁体中文转换为简体中文
+            simplified_text = converter.convert(result["text"])
+            
+            # 添加断句
+            sentence_endings = re.compile(r'([。！？])')
+            sentences = sentence_endings.split(simplified_text)
+            sentences = [s.strip() for s in sentences if s.strip()]
+            formatted_text = '\n'.join(sentences)
+            
+            # 生成输出文件名
+            base_name = os.path.splitext(filename)[0]
+            output_path = os.path.join(output_dir, f"{base_name}.txt")
+            
+            # 保存简体中文文本
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write(formatted_text)
+                
+        except Exception as e:
+            print(f"处理文件 {filename} 时出错: {str(e)}")
+
+if __name__ == "__main__":
+    # 使用示例
+    transcribe_audio_folder(
+        input_dir="/home/mei/work/asr/data",
+        output_dir="/home/mei/work/asr/out"
+    )