#!/usr/bin/env python3 """ Transcribe audio using Whisper and output a JSON file with per-sentence timestamps. Supports two engines (auto-detected): - faster-whisper (recommended, 4x faster, less memory) - openai-whisper (fallback) Usage: python3 transcribe.py [--model auto] [--language zh] [--engine auto] Output: /_transcript.json """ import argparse import json import os import sys # Allow importing utils from the same directory sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from utils import ( detect_gpu, detect_whisper_engine, recommend_whisper_model, get_whisper_device, setup_china_env, detect_platform, ) # Common filler words/phrases by language FILLER_PATTERNS = { "zh": [ "嗯", "呃", "额", "啊", "哦", "唔", "那个", "就是", "就是说", "然后呢", "怎么说呢", "对吧", "你知道吗", "我觉得吧", "基本上", ], "en": [ "um", "uh", "erm", "ah", "oh", "like", "you know", "i mean", "basically", "actually", "literally", "right", "so yeah", "kind of", "sort of", ], } def transcribe_faster_whisper(audio_path, model_name, language, device, compute_type, word_timestamps=False): """Transcribe using faster-whisper engine.""" from faster_whisper import WhisperModel print(f"[faster-whisper] Loading model: {model_name} (device={device}, compute={compute_type})") model = WhisperModel(model_name, device=device, compute_type=compute_type) kwargs = {"word_timestamps": word_timestamps} if language: kwargs["language"] = language segments_iter, info = model.transcribe(audio_path, **kwargs) detected_lang = info.language segments = [] for i, seg in enumerate(segments_iter, start=1): entry = { "id": i, "start": round(seg.start, 2), "end": round(seg.end, 2), "text": seg.text.strip(), } if word_timestamps and seg.words: entry["words"] = [ { "word": w.word.strip(), "start": round(w.start, 3), "end": round(w.end, 3), } for w in seg.words if w.word.strip() ] segments.append(entry) return segments, detected_lang def transcribe_openai_whisper(audio_path, model_name, language, word_timestamps=False): """Transcribe using openai-whisper engine.""" import whisper print(f"[openai-whisper] Loading model: {model_name}") model = whisper.load_model(model_name) kwargs = {"word_timestamps": word_timestamps} if language: kwargs["language"] = language result = model.transcribe(audio_path, **kwargs) detected_lang = result.get("language", "unknown") segments = [] for i, seg in enumerate(result.get("segments", []), start=1): entry = { "id": i, "start": round(seg["start"], 2), "end": round(seg["end"], 2), "text": seg["text"].strip(), } if word_timestamps and seg.get("words"): entry["words"] = [ { "word": w["word"].strip(), "start": round(w["start"], 3), "end": round(w["end"], 3), } for w in seg["words"] if w.get("word", "").strip() ] segments.append(entry) return segments, detected_lang def detect_silences(segments, min_gap=1.0): """Detect silent gaps between speech segments. Args: segments: List of {"id", "start", "end", "text"} dicts. min_gap: Minimum gap duration (seconds) to flag as silence. Returns: List of {"start", "end", "duration", "before_segment", "after_segment"} dicts. """ silences = [] for i in range(1, len(segments)): gap_start = segments[i - 1]["end"] gap_end = segments[i]["start"] gap_dur = gap_end - gap_start if gap_dur >= min_gap: silences.append({ "start": round(gap_start, 2), "end": round(gap_end, 2), "duration": round(gap_dur, 2), "before_segment": segments[i - 1]["id"], "after_segment": segments[i]["id"], }) return silences def detect_filler_words(segments, language="zh"): """Detect filler words/phrases in transcript segments. Args: segments: List of {"id", "start", "end", "text"} dicts. language: Language code ("zh", "en", etc.) Returns: List of {"segment_id", "text", "fillers_found", "is_filler_only"} dicts. """ import re lang_key = "zh" if language and language.startswith("zh") else "en" patterns = FILLER_PATTERNS.get(lang_key, FILLER_PATTERNS["en"]) results = [] for seg in segments: text = seg["text"].strip() text_lower = text.lower() found = [] for filler in patterns: if lang_key == "zh": if filler in text: found.append(filler) else: if re.search(r'\b' + re.escape(filler) + r'\b', text_lower): found.append(filler) if found: clean = text_lower for f in patterns: if lang_key == "zh": clean = clean.replace(f, "") else: clean = re.sub(r'\b' + re.escape(f) + r'\b', '', clean) clean = re.sub(r'[^\w]', '', clean).strip() is_filler_only = len(clean) == 0 or (len(text) <= 6 and len(found) > 0) results.append({ "segment_id": seg["id"], "text": text, "fillers_found": found, "is_filler_only": is_filler_only, }) return results def main(): parser = argparse.ArgumentParser(description="Transcribe audio with Whisper") parser.add_argument("audio_path", help="Path to the audio file (.wav)") parser.add_argument("--model", default="auto", help="Whisper model size: tiny/base/small/medium/large-v3/large-v3-turbo/auto (default: auto)") parser.add_argument("--language", default=None, help="Language code (e.g. zh, en, ja). Omit for auto-detection.") parser.add_argument("--engine", default="auto", choices=["auto", "faster-whisper", "openai-whisper"], help="Whisper engine (default: auto-detect)") parser.add_argument("--mirror", action="store_true", help="Use China mirrors for model download") parser.add_argument("--silence-threshold", type=float, default=1.0, help="Min gap (seconds) to flag as silence (default: 1.0). Set 0 to disable.") parser.add_argument("--word-timestamps", action="store_true", help="Include per-word timestamps (required for karaoke subtitles)") parser.add_argument("--detect-fillers", action="store_true", help="Detect filler words (um, uh, 嗯, 呃, etc.) and mark segments") args = parser.parse_args() audio_path = os.path.abspath(args.audio_path) if not os.path.isfile(audio_path): print(f"Error: File not found: {audio_path}", file=sys.stderr) sys.exit(1) # China mirror setup if args.mirror: os.environ["USE_CN_MIRROR"] = "1" setup_china_env() # Detect GPU & hardware gpu_info = detect_gpu() # Choose engine if args.engine == "auto": engine = detect_whisper_engine() if engine == "none": print("Error: No Whisper engine found.", file=sys.stderr) print("Install one of:", file=sys.stderr) print(" pip install faster-whisper (recommended, 4x faster)", file=sys.stderr) print(" pip install openai-whisper", file=sys.stderr) sys.exit(1) else: engine = args.engine # Choose model if args.model == "auto": model_name, reason = recommend_whisper_model(gpu_info) print(f"[auto] Selected model: {model_name} ({reason})") else: model_name = args.model print(f"Engine: {engine}") print(f"Model: {model_name}") print(f"GPU: {gpu_info['type']}") print(f"Transcribing: {audio_path}") # Run transcription if args.word_timestamps: print("Word-level timestamps: enabled (for karaoke subtitles)") if engine == "faster-whisper": device, compute_type = get_whisper_device(gpu_info) segments, detected_lang = transcribe_faster_whisper( audio_path, model_name, args.language, device, compute_type, word_timestamps=args.word_timestamps, ) else: segments, detected_lang = transcribe_openai_whisper( audio_path, model_name, args.language, word_timestamps=args.word_timestamps, ) # Build output path audio_dir = os.path.dirname(audio_path) audio_name = os.path.splitext(os.path.basename(audio_path))[0] video_name = audio_name.replace("_audio", "") output_path = os.path.join(audio_dir, f"{video_name}_transcript.json") # Detect silences silences = [] if args.silence_threshold > 0 and len(segments) > 1: silences = detect_silences(segments, min_gap=args.silence_threshold) output_data = { "source_audio": audio_path, "engine": engine, "model": model_name, "language": detected_lang, "segments": segments, } if silences: output_data["silences"] = silences # Detect filler words fillers = [] if args.detect_fillers: fillers = detect_filler_words(segments, detected_lang) if fillers: output_data["filler_words"] = fillers with open(output_path, "w", encoding="utf-8") as f: json.dump(output_data, f, ensure_ascii=False, indent=2) print(f"\nTranscription complete: {output_path}") print(f"Total segments: {len(segments)}") # Report silences if silences: total_silence = sum(s["duration"] for s in silences) print(f"\nDetected {len(silences)} silent gaps (>= {args.silence_threshold}s), total {total_silence:.1f}s:") for s in silences: print(f" {s['start']:7.2f}s - {s['end']:7.2f}s ({s['duration']:.1f}s gap, between #{s['before_segment']} and #{s['after_segment']})") print(f"\nTip: These gaps are likely stammers, pauses, or hesitations.") print(f" Exclude these segment ranges when building render_config.json.") else: print(f"\nNo significant silences detected (threshold: {args.silence_threshold}s).") # Report fillers if args.detect_fillers and fillers: filler_only = [f for f in fillers if f["is_filler_only"]] print(f"\nDetected filler words in {len(fillers)} segments ({len(filler_only)} filler-only):") for f in fillers[:10]: marker = " ← SKIP" if f["is_filler_only"] else "" print(f" #{f['segment_id']:3d} [{', '.join(f['fillers_found'])}] \"{f['text']}\"{marker}") if len(fillers) > 10: print(f" ... and {len(fillers) - 10} more") print("\nSegment preview:") for seg in segments[:5]: print(f" #{seg['id']:3d} [{seg['start']:7.2f}s - {seg['end']:7.2f}s] {seg['text']}") if len(segments) > 5: print(f" ... and {len(segments) - 5} more segments") if __name__ == "__main__": main()