#!/usr/bin/env python3
"""Video Narration - TTS Generation and Video Assembly

This script generates TTS audio segments using Azure Speech SDK,
positions them at specified timestamps, and merges them onto a video.

Usage:
    1. Fill in VOICE_NAME, INPUT_VIDEO, OUTPUT_VIDEO, and SEGMENTS
    2. Ensure AZURE_SPEECH_KEY is set in environment (or .env file)
    3. Run: python3 narration_script.py
"""

import os
import subprocess
import json
from dotenv import load_dotenv
import azure.cognitiveservices.speech as speechsdk

load_dotenv(os.path.expanduser("~/.narrate_video.env"))

# ── Configuration ──────────────────────────────────────────────────────────
SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY", "")
SERVICE_REGION = os.environ.get("AZURE_SPEECH_REGION", "eastus2")
VOICE_NAME = "REPLACE_WITH_VOICE"         # e.g. en-US-AndrewMultilingualNeural
INPUT_VIDEO = "REPLACE_WITH_INPUT"        # Relative path only
OUTPUT_VIDEO = "REPLACE_WITH_OUTPUT"      # Relative path only
SEGMENTS_DIR = "narration_segments"       # Relative path only

# ── Narration Segments ─────────────────────────────────────────────────────
# Each entry: (start_seconds, "narration text")
# Fill from Phase 2 script writing
SEGMENTS = []


# ── TTS Generation ─────────────────────────────────────────────────────────
def generate_segment(idx, text, output_path):
    """Generate a single audio segment using Azure TTS."""
    speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SERVICE_REGION)
    speech_config.speech_synthesis_voice_name = VOICE_NAME
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
    )
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=audio_config
    )
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"  [OK] Segment {idx}: {output_path}")
        return True
    details = result.cancellation_details
    print(f"  [FAIL] Segment {idx}: {details.reason} - {details.error_details}")
    return False


def get_audio_duration(path):
    """Get duration of an audio file in seconds via ffprobe."""
    result = subprocess.run(
        ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", path],
        capture_output=True, text=True
    )
    return float(json.loads(result.stdout)["format"]["duration"])


def build_narrated_video():
    """Generate TTS segments, check timing, and assemble the narrated video."""
    os.makedirs(SEGMENTS_DIR, exist_ok=True)

    # Step 1: Generate audio segments (skips existing files)
    print("=== Step 1: Generating audio segments ===")
    segment_files = []
    for i, (start, text) in enumerate(SEGMENTS):
        out_path = os.path.join(SEGMENTS_DIR, f"seg_{i:03d}.mp3")
        if not os.path.exists(out_path):
            if not generate_segment(i, text, out_path):
                return False
        else:
            print(f"  [SKIP] Segment {i}: already exists")
        segment_files.append((start, out_path))

    # Step 2: Get video duration
    video_duration = float(json.loads(subprocess.run(
        ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", INPUT_VIDEO],
        capture_output=True, text=True
    ).stdout)["format"]["duration"])
    print(f"\nVideo duration: {video_duration:.1f}s")

    # Step 3: Check timing overlaps — abort on any overlap
    print("\n=== Step 2: Checking segment timings ===")
    has_overlap = False
    for i, (start, path) in enumerate(segment_files):
        dur = get_audio_duration(path)
        end = start + dur
        next_start = segment_files[i + 1][0] if i + 1 < len(segment_files) else video_duration
        gap = next_start - end
        status = "OK" if gap >= 0 else "OVERLAP"
        print(f"  Seg {i:2d}: {start:6.1f}s - {end:6.1f}s (dur: {dur:5.1f}s) gap: {gap:+.1f}s [{status}]")
        if gap < 0:
            has_overlap = True
            print(f"    WARNING: Overlap of {-gap:.1f}s with next segment!")
    if has_overlap:
        print("\nERROR: Fix overlaps before proceeding.")
        return False

    # Step 4: Build ffmpeg command
    print("\n=== Step 3: Building narrated video ===")
    inputs = ["-i", INPUT_VIDEO]
    for _, path in segment_files:
        inputs.extend(["-i", path])

    filter_parts = []
    n = len(segment_files)
    for i, (start, _) in enumerate(segment_files):
        delay_ms = int(start * 1000)
        filter_parts.append(f"[{i+1}:a]adelay={delay_ms}|{delay_ms}[a{i}]")

    # normalize=0 is essential: without it, amix divides volume by input count,
    # so 20 segments would reduce audio to 1/20th volume — nearly silent.
    mix_inputs = "".join(f"[a{i}]" for i in range(n))
    filter_parts.append(
        f"{mix_inputs}amix=inputs={n}:duration=longest"
        f":dropout_transition=0:normalize=0[final]"
    )

    # Original video audio is completely discarded — only the narration track
    # is mapped. Mixing original audio even at low volume causes audible
    # double-voice artifacts because the narration bleeds through both tracks.
    filter_complex = ";".join(filter_parts)
    cmd = [
        "ffmpeg", "-y",
        *inputs,
        "-filter_complex", filter_complex,
        "-map", "0:v",
        "-map", "[final]",
        "-c:v", "copy",       # Copy video without re-encoding
        "-c:a", "aac", "-b:a", "192k",
        "-shortest",
        OUTPUT_VIDEO
    ]

    print("  Running ffmpeg...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"  ffmpeg error:\n{result.stderr[-2000:]}")
        return False

    print(f"\n=== Done! Output: {OUTPUT_VIDEO} ===")
    return True


def check_inputs():
    """Validate all required configuration before running."""
    errors = []
    if not SPEECH_KEY:
        errors.append("AZURE_SPEECH_KEY not found. Add it to ~/.narrate_video.env")
    if not os.environ.get("AZURE_SPEECH_REGION"):
        errors.append("AZURE_SPEECH_REGION not found. Add it to ~/.narrate_video.env")
    if VOICE_NAME == "REPLACE_WITH_VOICE":
        errors.append("VOICE_NAME not set. Replace the placeholder with a voice name (e.g. en-US-AndrewMultilingualNeural)")
    if INPUT_VIDEO == "REPLACE_WITH_INPUT":
        errors.append("INPUT_VIDEO not set. Replace the placeholder with the input video path")
    elif not os.path.isfile(INPUT_VIDEO):
        errors.append(f"INPUT_VIDEO not found: {INPUT_VIDEO}")
    if OUTPUT_VIDEO == "REPLACE_WITH_OUTPUT":
        errors.append("OUTPUT_VIDEO not set. Replace the placeholder with the output video path")
    if not SEGMENTS:
        errors.append("SEGMENTS is empty. Add at least one (start_seconds, text) tuple")
    for i, seg in enumerate(SEGMENTS):
        if not isinstance(seg, (list, tuple)) or len(seg) != 2:
            errors.append(f"SEGMENTS[{i}]: must be a (start_seconds, text) tuple")
        elif not isinstance(seg[0], (int, float)) or seg[0] < 0:
            errors.append(f"SEGMENTS[{i}]: start_seconds must be a non-negative number")
        elif not isinstance(seg[1], str) or not seg[1].strip():
            errors.append(f"SEGMENTS[{i}]: text must be a non-empty string")
    if errors:
        print("ERROR: Fix the following before running:\n")
        for e in errors:
            print(f"  - {e}")
        return False
    return True


if __name__ == "__main__":
    if not check_inputs():
        exit(1)
    build_narrated_video()