#!/usr/bin/env python3
"""analysis_orchestrator.py — Single entry point for analyzing any input.

Takes any combination of inputs and runs the right analysis scripts:
- Audio file → analyze_vocal_emotion.py + analyze_audio.py + (optional) extract_lyrics_whisper.py
- Two audio files → analyze_two_songs.py + vocal emotion on Song A
- Video file → extract_video_features.py + vocal emotion on the extracted audio
- Image file → analyze_image.py
- YouTube URL → download with the standalone download_youtube.py, then treat as audio

Output: a unified JSON containing all the analyses, ready for emotion_to_prompt.py
or direct use by the LLM.

Usage:
    # Single audio (with lyrics)
    python3 analysis_orchestrator.py --audio /tmp/song.wav --lyrics --output /tmp/analysis.json

    # Two audios (mashup)
    python3 analysis_orchestrator.py --audio /tmp/a.wav --audio /tmp/b.wav \\
        --name-a "Song A" --name-b "Song B" --output /tmp/analysis.json

    # Video
    python3 analysis_orchestrator.py --video /tmp/clip.mp4 --output /tmp/analysis.json

    # Image
    python3 analysis_orchestrator.py --image /tmp/album_art.jpg --output /tmp/analysis.json

    # YouTube URL (downloads first, --lyrics optional)
    python3 analysis_orchestrator.py --youtube "https://youtube.com/watch?v=..." --lyrics --output /tmp/analysis.json

    # Combination: audio + image (Song A audio, plus album art for style cues)
    python3 analysis_orchestrator.py --audio /tmp/song.wav --image /tmp/art.jpg \\
        --output /tmp/analysis.json
"""
import sys
import os
import json
import re
import argparse
import subprocess
import tempfile

# Directory containing this script. It is pushed to the front of sys.path so
# the function-level imports of sibling analysis scripts used throughout this
# file resolve no matter which working directory the tool is launched from.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

# Project-local helper; presumably lives next to this script, which is why the
# import sits after the sys.path adjustment above.
from _key_compat import mashup_compatibility


# Regex fragments for parenthesized / bracketed tags that uploaders append to
# YouTube titles. Each entry matches one complete "(...)" or "[...]" group.
_YT_NOISE = [
    # Common YouTube filename noise that pollutes song identification.
    r"\(official\s+(?:music\s+)?video\)",
    r"\(official\s+audio\)",
    r"\(official\s+lyric\s+video\)",
    r"\(official\s+lyrics\)",
    r"\(lyric\s+video\)",
    r"\(lyrics?\)",
    r"\(music\s+video\)",
    r"\(hd\s+(?:hq\s+)?video\)",
    r"\[official\s+(?:music\s+)?video\]",
    r"\[official\s+lyric\s+video\]",
    r"\[lyric\s+video\]",
    r"\[lyrics?\]",
    r"\[hq\]",
    r"\[hd\]",
    r"\[4k\]",
    r"\[1080p\]",
    r"\(remastered(?:\s+\d{4})?\)",
    r"\(remaster\)",
    r"\(live(?:\s+at\s+[^)]+)?\)",
    r"\(audio\)",
    r"\(mono\)",
    r"\(stereo\)",
]
# Single case-insensitive alternation of all fragments above, compiled once at
# import time and used by _strip_youtube_noise().
_YT_NOISE_RE = re.compile("|".join(_YT_NOISE), re.IGNORECASE)


def _strip_youtube_noise(text: str) -> str:
    """Return *text* with YouTube tag groups such as '(Official Video)' removed.

    Falsy input (``None`` or an empty string) is returned unchanged. For
    anything else the noise tags matched by ``_YT_NOISE_RE`` are deleted,
    whitespace runs are squeezed to single spaces, and leading/trailing
    spaces, tabs, hyphens, underscores, en/em dashes and dots are trimmed.
    """
    if not text:
        return text
    result = _YT_NOISE_RE.sub("", text)
    # Applied in order: squeeze whitespace first, then normalize the spacing
    # around any dash that sits between two whitespace runs.
    for pattern, replacement in (
        (r"\s+", " "),
        (r"\s+([\-\u2013\u2014])\s+", r" \1 "),
    ):
        result = re.sub(pattern, replacement, result)
    return result.strip(" \t-_\u2013\u2014.")


def _parse_song_stem(stem: str) -> tuple:
    """Split an 'Artist - Title [noise]' stem into ``(artist, title)``.

    Only the first ' - ' (a hyphen with a space on each side) is treated as
    the separator, so any later ' - ' stays inside the title
    (e.g. "Song - Acoustic - Live"). The artist is whitespace-trimmed; the
    title is additionally passed through ``_strip_youtube_noise``.

    Returns ``("", "")`` when the stem is empty, has no separator, or when
    either half is empty after cleaning.
    """
    nothing = ("", "")
    if not stem:
        return nothing
    left, separator, right = stem.partition(" - ")
    if not separator:
        return nothing
    artist = left.strip()
    title = _strip_youtube_noise(right).strip()
    return (artist, title) if artist and title else nothing

# Lower-case audio file extensions (with leading dot). Used to decide whether
# an output path already carries an audio suffix and to locate the file that
# yt-dlp produced in the download fallback.
AUDIO_EXTENSIONS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac', '.wma', '.opus'}


def _normalize_output_base(path):
    """Drop a trailing audio extension from *path*, if it has one.

    The extension check is case-insensitive against ``AUDIO_EXTENSIONS``.
    Paths with no extension, or with a non-audio extension, are returned
    unchanged.
    """
    stem, suffix = os.path.splitext(path)
    if suffix.lower() in AUDIO_EXTENSIONS:
        return stem
    return path


def _lazy_import_analyze_audio():
    """Import ``analyze_audio.analyze`` on demand and return the callable.

    The import happens at call time, so an import failure surfaces to the
    caller instead of at module load.
    """
    from analyze_audio import analyze as basic_analyzer
    return basic_analyzer


def _lazy_import_analyze_song():
    """Import ``analyze_two_songs.analyze_song`` on demand and return it.

    The import happens at call time, so an import failure surfaces to the
    caller instead of at module load.
    """
    from analyze_two_songs import analyze_song as song_analyzer
    return song_analyzer


def _run_vocal_emotion(audio_path, use_demucs=False):
    """Analyze vocal emotion for *audio_path* through a direct function call.

    When ``use_demucs`` is true, ``extract_stems.separate_stems`` is asked for
    a 'vocals' stem first (model 'htdemucs'). If that yields an existing file
    the stem is analyzed instead of the full mix; any exception during
    separation is reported on stderr and the full mix is used.

    Returns the dict produced by ``analyze_vocal_emotion.analyze_audio``. When
    a stem was used and the result is a dict, a ``'_demucs'`` entry with the
    model, stem path and cache info is attached. On failure (including a
    ``SystemExit`` raised by the analyzer) an ``{'error': '...'}`` dict is
    returned instead of raising.
    """
    analysis_input = audio_path
    separation = None
    if use_demucs:
        try:
            from extract_stems import separate_stems
            print("Running Demucs source separation (--use-demucs)...", file=sys.stderr)
            candidate = separate_stems(
                audio_path=audio_path,
                model_name='htdemucs',
                target_stems=['vocals'],
            )
            usable = 'error' not in candidate and 'stems' in candidate
            stem_file = candidate['stems'].get('vocals') if usable else None
            if stem_file and os.path.exists(stem_file):
                analysis_input = stem_file
                separation = candidate
        except Exception as exc:
            print(f"Demucs failed, falling back to mix: {exc}", file=sys.stderr)

    try:
        from analyze_vocal_emotion import analyze_audio
        # The analyzer takes an argparse-style namespace rather than kwargs.
        emotion = analyze_audio(argparse.Namespace(
            audio=analysis_input,
            sections=None,
            output=None,
            hop_length=512,
        ))
        if separation and isinstance(emotion, dict):
            stems = separation.get('stems', {})
            emotion['_demucs'] = {
                'model': separation.get('model'),
                'vocals_stem': stems.get('vocals'),
                'cache': separation.get('cache'),
            }
        return emotion
    except SystemExit:
        return {'error': 'vocal emotion analysis exited via SystemExit'}
    except Exception as exc:
        return {'error': f'vocal emotion analysis failed: {exc}'}


def _run_lyrics_extraction(audio_path, model='base'):
    """Transcribe lyrics from *audio_path* via ``extract_lyrics_whisper``.

    Returns whatever ``transcribe`` returns. Any exception, including a
    missing ``extract_lyrics_whisper`` module, is converted into an
    ``{'error': '...'}`` dict rather than raised.
    """
    try:
        from extract_lyrics_whisper import transcribe as whisper_transcribe
        outcome = whisper_transcribe(audio_path, model_name=model)
    except Exception as exc:
        outcome = {'error': f'lyrics extraction failed: {exc}'}
    return outcome


def _run_web_lyrics_lookup(artist, title, whisper_text='', min_match=0.6):
    """Query ``fetch_lyrics_web.fetch_lyrics_web`` for a song; never raises.

    On success the helper's own result dict is returned unchanged; it is
    expected to carry a ``"status"`` such as "matched", "no_web_lyrics",
    "low_match", "unverified" or "network_error", plus ``"lyrics"``
    (str or None) and further metadata.

    If the import or the call raises, a stand-in dict is returned with
    ``status="network_error"``, ``source="lrclib"``, the requested artist and
    title, empty metadata fields and an ``"error"`` message.
    """
    try:
        from fetch_lyrics_web import fetch_lyrics_web as lookup
        return lookup(
            artist=artist,
            title=title,
            whisper_transcript=whisper_text,
            min_match=min_match,
        )
    except Exception as exc:
        # Keyword order is kept as the key order of the emitted JSON object.
        return dict(
            status="network_error",
            lyrics=None,
            source="lrclib",
            match_score=None,
            artist=artist,
            title=title,
            album=None,
            duration=None,
            error=f"web lyrics lookup crashed: {exc}",
        )


def _run_beat_tracking(audio_path):
    """Run ``track_beats.track_beats`` on *audio_path* with ``device='cpu'``.

    Returns the tracker's result, or ``{'error': '...'}`` if the import or
    the call raises.
    """
    try:
        from track_beats import track_beats as detect_beats
        outcome = detect_beats(audio_path, device='cpu')
    except Exception as exc:
        outcome = {'error': f'beat tracking failed: {exc}'}
    return outcome


def _run_melody_extraction(audio_path, max_seconds=300):
    """Run ``extract_melody.extract_melody`` on *audio_path*.

    Returns the extractor's result, or ``{'error': '...'}`` if the import or
    the call raises.

    NOTE(review): ``max_seconds`` is accepted but not forwarded to
    ``extract_melody`` -- confirm whether the extractor supports a duration
    cap and whether it should be passed through.
    """
    try:
        from extract_melody import extract_melody as transcribe_melody
        outcome = transcribe_melody(audio_path)
    except Exception as exc:
        outcome = {'error': f'melody extraction failed: {exc}'}
    return outcome


def _run_mert_embedding(audio_path, max_seconds=120):
    """Run ``compute_audio_embedding.compute_mert_embedding`` on *audio_path*.

    The call is made with ``device='cpu'`` and the given ``max_seconds``.
    Returns the embedding result, or ``{'error': '...'}`` if the import or
    the call raises.
    """
    try:
        from compute_audio_embedding import compute_mert_embedding as embed
        outcome = embed(audio_path, device='cpu', max_seconds=max_seconds)
    except Exception as exc:
        outcome = {'error': f'MERT embedding failed: {exc}'}
    return outcome


def _run_ast_classification(audio_path, top_k=15):
    """Run ``classify_instruments.classify_instruments`` on *audio_path*.

    The call is made with ``device='cpu'`` and the given ``top_k``.
    Returns the classifier's result, or ``{'error': '...'}`` if the import or
    the call raises.
    """
    try:
        from classify_instruments import classify_instruments as classify
        outcome = classify(audio_path, top_k=top_k, device='cpu')
    except Exception as exc:
        outcome = {'error': f'AST classification failed: {exc}'}
    return outcome


def _collect_lyrics(audio_path, lyrics_source, lyrics_model,
                    lyrics_artist, lyrics_title, lyrics_web_min_match):
    """Resolve lyrics for one audio file according to *lyrics_source*.

    Returns the value to store under ``result['lyrics']``:
    - "whisper": the Whisper result dict.
    - "web": ``{'web_lookup': ..., 'whisper': None}`` when a lookup ran and
      returned something truthy, otherwise ``None``.
    - "auto": ``{'source': 'web' | 'whisper', 'whisper': ..., 'web_lookup': ...}``.
    - anything else (e.g. "off"): ``None``.
    """
    # The web lookup needs both an artist and a title; without them it is
    # skipped entirely.
    can_query_web = (lyrics_source in ('web', 'auto')
                     and bool(lyrics_artist and lyrics_title))

    whisper_result = None
    whisper_model_used = None
    if lyrics_source in ('whisper', 'auto'):
        # In auto mode the first transcript only feeds the word-overlap score
        # against the web lyrics, so the fast 'tiny' model is enough. When no
        # web lookup can happen there is nothing to score, so go straight to
        # the requested model instead of transcribing twice.
        scoring_only = lyrics_source == 'auto' and can_query_web
        whisper_model_used = 'tiny' if scoring_only else lyrics_model
        whisper_result = _run_lyrics_extraction(audio_path, model=whisper_model_used)

    web_result = None
    if can_query_web:
        whisper_text = ''
        if isinstance(whisper_result, dict):
            whisper_text = whisper_result.get('raw_transcript', '') or ''
        web_result = _run_web_lyrics_lookup(
            artist=lyrics_artist,
            title=lyrics_title,
            whisper_text=whisper_text,
            min_match=lyrics_web_min_match,
        )

    if lyrics_source == 'whisper':
        return whisper_result

    if lyrics_source == 'web':
        if web_result:
            return {
                'web_lookup': web_result,
                'whisper': whisper_result,  # always None in web-only mode
            }
        return whisper_result  # None: no lookup was possible

    if lyrics_source == 'auto':
        if web_result and web_result.get('status') == 'matched':
            # Web won. The scoring transcript is kept for inspection; skipping
            # a second Whisper run keeps auto mode fast.
            return {
                'source': 'web',
                'web_lookup': web_result,
                'whisper': whisper_result,
            }
        # Web missed (or was never tried). Make sure the transcript surfaced
        # to the caller comes from the model the user asked for.
        if whisper_model_used != lyrics_model:
            whisper_result = _run_lyrics_extraction(audio_path, model=lyrics_model)
        return {
            'source': 'whisper',
            'whisper': whisper_result,
            'web_lookup': web_result,  # may be None or e.g. low_match
        }

    return None


def analyze_audio_file(audio_path, run_emotion=True, run_lyrics=False, lyrics_model='base',
                       use_demucs=False, run_advanced=True,
                       lyrics_source='whisper', lyrics_artist='', lyrics_title='',
                       lyrics_web_min_match=0.6):
    """Run all audio analyses on one file and return a combined result dict.

    Every analysis is best-effort: failures are captured inside the result
    (as ``{'error': ...}`` entries) rather than raised.

    Args:
        audio_path: Path of the audio file to analyze.
        run_emotion: Run vocal emotion analysis.
        run_lyrics: Run lyrics extraction (see ``lyrics_source``).
        lyrics_model: Whisper model name for the user-facing transcript.
        use_demucs: Passed to the vocal emotion step to analyze a separated
            vocal stem instead of the full mix.
        run_advanced: Also run beat tracking, melody extraction, MERT
            embedding and AST classification.
        lyrics_source: Lyrics strategy when ``run_lyrics`` is true:
            "whisper" uses Whisper only; "web" uses the web lookup only;
            "auto" scores a quick Whisper transcript against the web lookup
            and falls back to Whisper with ``lyrics_model`` when the web
            result is not "matched"; "off" (or any other value) yields None.
            The web lookup only runs when both ``lyrics_artist`` and
            ``lyrics_title`` are non-empty.
        lyrics_artist: Artist name used for the web lookup.
        lyrics_title: Song title used for the web lookup.
        lyrics_web_min_match: Minimum match score passed to the web lookup.

    Returns:
        Dict with keys ``audio_file``, ``basic_features``, ``vocal_emotion``,
        ``lyrics``, ``beats``, ``melody``, ``mert_embedding`` and
        ``ast_classification``; analyses that were not requested stay None.
    """
    result = {
        'audio_file': audio_path,
        'basic_features': None,
        'vocal_emotion': None,
        'lyrics': None,
        'beats': None,
        'melody': None,
        'mert_embedding': None,
        'ast_classification': None,
    }
    try:
        run_analyze_audio = _lazy_import_analyze_audio()
        result['basic_features'] = run_analyze_audio(audio_path)
    except Exception as e:
        result['basic_features'] = {'error': str(e)}

    if run_emotion:
        result['vocal_emotion'] = _run_vocal_emotion(audio_path, use_demucs=use_demucs)

    if run_lyrics:
        result['lyrics'] = _collect_lyrics(
            audio_path,
            lyrics_source=lyrics_source,
            lyrics_model=lyrics_model,
            lyrics_artist=lyrics_artist,
            lyrics_title=lyrics_title,
            lyrics_web_min_match=lyrics_web_min_match,
        )

    if run_advanced:
        result['beats'] = _run_beat_tracking(audio_path)
        result['melody'] = _run_melody_extraction(audio_path)
        result['mert_embedding'] = _run_mert_embedding(audio_path)
        result['ast_classification'] = _run_ast_classification(audio_path)

    return result


def download_youtube(url, output_path, prefer_standalone=True):
    """Fetch the audio track of a YouTube URL and return the file path.

    With ``prefer_standalone`` (the default) the sibling ``download_youtube``
    module is tried first and its return value is passed straight through.
    If that module cannot be imported, or an ``AttributeError`` surfaces, the
    function falls back to invoking the ``yt-dlp`` executable directly
    (WAV extraction, 300 second timeout).

    In the fallback, the output template is *output_path* with any audio
    extension removed. After a successful run the first existing
    ``<base><ext>`` for an extension in ``AUDIO_EXTENSIONS`` is returned,
    then *output_path* itself if it exists.

    Returns None when yt-dlp is missing, times out, exits non-zero, or no
    output file can be found.
    """
    if prefer_standalone:
        try:
            from download_youtube import download_youtube as _standalone
            return _standalone(url, output_path)
        except (ImportError, AttributeError):
            pass  # fall through to the plain yt-dlp invocation below

    try:
        stem = _normalize_output_base(output_path)
        command = ['yt-dlp', '-x', '--audio-format', 'wav', '-o', f'{stem}.%(ext)s', url]
        proc = subprocess.run(command, capture_output=True, text=True, timeout=300)
        if proc.returncode != 0:
            return None
        produced = next(
            (path for path in (f'{stem}{ext}' for ext in AUDIO_EXTENSIONS)
             if os.path.exists(path)),
            None,
        )
        if produced is not None:
            return produced
        return output_path if os.path.exists(output_path) else None
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return None


def _remove_youtube_temp_outputs(tmp_path):
    """Best-effort removal of one YouTube temp download and its siblings.

    The downloader may leave the audio under the reserved temp name or under
    the same base name with a different audio extension, so both are removed.
    Missing files and other OS errors are ignored.
    """
    base = os.path.splitext(tmp_path)[0]
    candidates = {tmp_path}
    candidates.update(f'{base}{ext}' for ext in AUDIO_EXTENSIONS)
    for candidate in candidates:
        try:
            os.unlink(candidate)
        except OSError:
            pass


def orchestrate(audios=(), video=None, image=None, youtubes=(), name_a=None, name_b=None,
                run_lyrics=False, lyrics_model='base', use_demucs=False,
                run_advanced=True, image_ocr=False, image_faces=False, image_vlm=False,
                lyrics_source='whisper', lyrics_web_min_match=0.6):
    """Run all the requested analyses and return a unified result dict.

    Args:
        audios: Local audio file paths.
        video: Optional video file path.
        image: Optional image file path.
        youtubes: YouTube URLs; each is downloaded to a temp file and then
            treated like a local audio file. Temp downloads are deleted
            before this function returns or raises.
        name_a, name_b: Labels for the first / second song. ``name_a`` is
            also tried as an "Artist - Title" hint for web lyrics lookup.
        run_lyrics, lyrics_model, lyrics_source, lyrics_web_min_match:
            Lyrics options; see ``analyze_audio_file``. In the two-song case
            only Whisper is used, and ``lyrics_source='off'`` disables it.
        use_demucs: Analyze a separated vocal stem for vocal emotion.
        run_advanced: Controls beat_this + Basic Pitch + MERT + AST for the
            single-audio case. Default True.
        image_ocr, image_faces, image_vlm: Optional image-side analyses.

    Returns:
        Dict with an ``'inputs'`` summary plus, depending on the inputs:
        ``'mashup'`` (two or more audio sources; only the first two are
        analyzed), ``'audio'`` (one source, or None when there is none),
        ``'video'`` and ``'image'``. Video and image failures are reported as
        ``{'error': ...}`` entries.
    """
    result = {
        'inputs': {
            'audio_files': list(audios),
            'video_files': [video] if video else [],
            'image_files': [image] if image else [],
            'youtube_urls': list(youtubes),
            'name_a': name_a,
            'name_b': name_b,
        },
    }

    youtube_paths = []
    youtube_tmpfiles = []

    # Downloads happen inside the try so that a failure on a later URL (or in
    # any analysis) still cleans up files fetched for earlier URLs.
    try:
        for url in youtubes:
            tmp_fd, tmp_path = tempfile.mkstemp(suffix='.wav')
            os.close(tmp_fd)
            # Reserve a unique name, then remove the placeholder so yt-dlp does not
            # mistake the empty temp file for an already downloaded destination.
            os.unlink(tmp_path)
            youtube_tmpfiles.append(tmp_path)
            print(f"Downloading YouTube: {url}", file=sys.stderr)
            downloaded = download_youtube(url, tmp_path)
            if downloaded and os.path.exists(downloaded):
                youtube_paths.append(downloaded)
                result['inputs']['audio_files'].append(downloaded)
            else:
                # Any partial output is removed by the cleanup in ``finally``.
                print(f"Failed to download: {url}", file=sys.stderr)

        all_audio_paths = list(audios) + youtube_paths

        if len(all_audio_paths) >= 2:
            run_analyze_song = _lazy_import_analyze_song()
            song_a_data = run_analyze_song(all_audio_paths[0], name_a or 'Song A')
            song_b_data = run_analyze_song(all_audio_paths[1], name_b or 'Song B')
            compat = mashup_compatibility(song_a_data, song_b_data)
            mashup_result = {
                'song_a': song_a_data,
                'song_b': song_b_data,
                'compatibility': compat,
            }
            # "off" means skip lyrics entirely, matching the single-audio path.
            if run_lyrics and lyrics_source != 'off':
                mashup_result['song_a_lyrics'] = _run_lyrics_extraction(
                    all_audio_paths[0], model=lyrics_model
                )
            song_a_emotion = _run_vocal_emotion(all_audio_paths[0], use_demucs=use_demucs)
            # Vocal emotion is optional here: an error result is dropped.
            if song_a_emotion and not song_a_emotion.get('error'):
                mashup_result['song_a_vocal_emotion'] = song_a_emotion
            result['mashup'] = mashup_result
        elif len(all_audio_paths) == 1:
            # For the single-audio case, optionally try to identify the song
            # for web lyrics lookup: first from the file name stem, then from
            # name_a. Stems starting with "audio"/"input" are treated as
            # generic names that carry no song identity.
            stem = os.path.splitext(os.path.basename(all_audio_paths[0]))[0]
            if not stem or stem.startswith(("audio", "input")):
                stem = ""
            song_artist, song_title = _parse_song_stem(stem) if stem else ("", "")
            if (not song_artist or not song_title) and name_a:
                song_artist, song_title = _parse_song_stem(name_a)

            result['audio'] = analyze_audio_file(
                all_audio_paths[0],
                run_emotion=True,
                run_lyrics=run_lyrics,
                lyrics_model=lyrics_model,
                use_demucs=use_demucs,
                run_advanced=run_advanced,
                lyrics_source=lyrics_source,
                lyrics_artist=song_artist,
                lyrics_title=song_title,
                lyrics_web_min_match=lyrics_web_min_match,
            )
        else:
            result['audio'] = None  # type: ignore[assignment]

        if video:
            try:
                from extract_video_features import analyze_video
                video_result = analyze_video(video)
                video_audio_path = None
                try:
                    # Extract a mono 22.05 kHz WAV track for vocal emotion.
                    tmp_fd, video_audio_path = tempfile.mkstemp(suffix='.wav')
                    os.close(tmp_fd)
                    cmd = ['ffmpeg', '-y', '-i', video, '-vn', '-ar', '22050',
                           '-ac', '1', '-f', 'wav', video_audio_path]
                    ffmpeg_result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
                    if ffmpeg_result.returncode == 0 and os.path.exists(video_audio_path):
                        video_emotion = _run_vocal_emotion(video_audio_path, use_demucs=use_demucs)
                        if video_emotion and not video_emotion.get('error'):
                            video_result['vocal_emotion'] = video_emotion
                except (FileNotFoundError, subprocess.TimeoutExpired):
                    # ffmpeg missing or too slow: keep the visual analysis
                    # and skip the optional audio part.
                    pass
                finally:
                    if video_audio_path and os.path.exists(video_audio_path):
                        try:
                            os.unlink(video_audio_path)
                        except OSError:
                            pass
                result['video'] = video_result
            except Exception as e:
                result['video'] = {'error': f'video analysis failed: {e}'}

        if image:
            try:
                from analyze_image import analyze_image
                result['image'] = analyze_image(
                    image,
                    use_ocr=image_ocr,
                    use_faces=image_faces,
                    vlm_caption=image_vlm,
                )
            except Exception as e:
                result['image'] = {'error': f'image analysis failed: {e}'}
    finally:
        for tmp_path in youtube_tmpfiles:
            _remove_youtube_temp_outputs(tmp_path)

    return result


def main():
    """Command-line entry point.

    Parses the CLI arguments, validates them, runs ``orchestrate`` and emits
    the unified analysis as indented JSON: written to ``--output`` as UTF-8
    when given, otherwise printed to stdout. Progress and status messages go
    to stderr. Invalid argument combinations exit through ``parser.error``.
    """
    parser = argparse.ArgumentParser(description='Analyze any music input(s) and produce a unified analysis')
    parser.add_argument('--audio', action='append', default=[],
                        help='Audio file path (can be specified multiple times for mashup)')
    parser.add_argument('--video', help='Video file path')
    parser.add_argument('--image', help='Image file path (album art, etc.)')
    parser.add_argument('--youtube', action='append', default=[],
                        help='YouTube URL (will be downloaded)')
    parser.add_argument('--name-a', help='Name/label for the first song (content source)')
    parser.add_argument('--name-b', help='Name/label for the second song (style reference)')
    parser.add_argument('--lyrics', action='store_true',
                        help='Run Whisper lyrics extraction on audio inputs (slower)')
    parser.add_argument('--lyrics-model', default='base',
                        choices=['tiny', 'base', 'small', 'medium', 'large'],
                        help='Whisper model size for lyrics extraction (default: base)')
    parser.add_argument('--lyrics-source', default='whisper',
                        choices=['whisper', 'web', 'auto', 'off'],
                        help='Lyrics source strategy. "whisper" = always use Whisper. '
                             '"web" = always try LRCLib. "auto" = Whisper first, then '
                             'optionally fall back to LRCLib if the song is recognized. '
                             '"off" = skip lyrics entirely. (default: whisper)')
    parser.add_argument('--lyrics-web-min-match', type=float, default=0.6,
                        help='Minimum word-overlap score to accept web-fetched lyrics '
                             '(0.0-1.0, default 0.6). Lower = more permissive.')
    parser.add_argument('--use-demucs', action='store_true',
                        help='Run Demucs source separation first, then analyze the vocal stem. '
                             'Dramatically improves pitch/HNR/silence detection on busy mixes. '
                             'CPU: ~30s for 4min song, GPU: ~5s. Requires: pip install demucs')
    parser.add_argument('--no-advanced', action='store_true',
                        help='Skip beat_this, Basic Pitch, MERT, and AST analyses (faster, '
                             'less rich). Default: run all advanced analyses.')
    parser.add_argument('--vlm', action='store_true',
                        help='For image inputs, also call "mmx vision describe" (MiniMax 3.0) '
                             'for a free-form caption. Requires the mmx CLI.')
    parser.add_argument('--ocr', action='store_true',
                        help='For image inputs, also run RapidOCR text extraction.')
    parser.add_argument('--faces', action='store_true',
                        help='For image inputs, also run MediaPipe face detection.')
    parser.add_argument('--output', '-o', help='Output JSON file path')
    args = parser.parse_args()

    if not (args.audio or args.video or args.image or args.youtube):
        parser.error('At least one input is required: --audio, --video, --image, or --youtube')

    audio_source_count = len(args.audio) + len(args.youtube)
    if audio_source_count > 2:
        parser.error(
            f'At most two audio sources are supported for mashup analysis; '
            f'got {audio_source_count}. Split the request or treat extra sources as text-only references.'
        )

    # Fail fast on an out-of-range threshold (this also rejects NaN) instead
    # of discovering it after the expensive analyses have already run.
    if not 0.0 <= args.lyrics_web_min_match <= 1.0:
        parser.error(
            f'--lyrics-web-min-match must be between 0.0 and 1.0; '
            f'got {args.lyrics_web_min_match}'
        )

    result = orchestrate(
        audios=tuple(args.audio),
        video=args.video,
        image=args.image,
        youtubes=tuple(args.youtube),
        name_a=args.name_a,
        name_b=args.name_b,
        run_lyrics=args.lyrics,
        lyrics_model=args.lyrics_model,
        use_demucs=args.use_demucs,
        run_advanced=not args.no_advanced,
        image_ocr=args.ocr,
        image_faces=args.faces,
        image_vlm=args.vlm,
        lyrics_source=args.lyrics_source,
        lyrics_web_min_match=args.lyrics_web_min_match,
    )

    class _NumpyEncoder(json.JSONEncoder):
        """JSON encoder that converts NumPy scalars and arrays to builtins."""

        def default(self, o):  # noqa: ARG002 (signature must match JSONEncoder)
            try:
                import numpy as np
            except ImportError:
                # Without NumPy there is nothing special to convert.
                return super().default(o)
            # np.bool_ is neither np.integer nor np.floating, so it needs its
            # own branch to avoid a TypeError during serialization.
            if isinstance(o, np.bool_):
                return bool(o)
            if isinstance(o, np.integer):
                return int(o)
            if isinstance(o, np.floating):
                return float(o)
            if isinstance(o, np.ndarray):
                return o.tolist()
            return super().default(o)

    json_out = json.dumps(result, indent=2, ensure_ascii=False, cls=_NumpyEncoder)
    if args.output:
        # ensure_ascii=False can emit non-ASCII characters (titles, lyrics),
        # so write UTF-8 explicitly instead of relying on the locale default.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(json_out)
        print(f"Written: {args.output}", file=sys.stderr)
    else:
        print(json_out)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()