#!/usr/bin/env python3
"""emotion_to_prompt.py — Convert emotion analysis + style analysis into music generation prompts.

Takes the JSON outputs from:
  - analyze_vocal_emotion.py  (emotion dynamics from Song A)
  - analyze_audio.py or analyze_two_songs.py  (style features from Song B)
  - analysis_orchestrator.py  (full unified output; enables image/video/mashup consumption)

And produces a structured music generation prompt suitable for Suno, Udio,
MiniMax, or other music generation tools.

# DISCARDED (intentionally not consumed) — v0.1.1
# ---------------------------------------------
# analyze_vocal_emotion.py:
#   - emotion_sections[].avg_pitch_hz, pitch_std_hz, pitch_trend_val, max_intensity,
#     intensity_range, voiced_ratio, spectral_contrast (raw numeric per-section values)
#     Reason: only the categorical/classifier outputs are stable; raw values vary too
#     much between recordings to be useful as prompt text.
#   - formant_tracks_available (boolean flag, never used)
#   - per-section jitter_pct, shimmer_pct, hnr_db raw values (global aggregate used
#     instead; per-section "breathier in verse" style aggregation added in v0.1.1)
# analyze_audio.py / analyze_two_songs.py:
#   - beat_count, duration_formatted, approx_beat_count
#     Reason: redundant with BPM and duration_seconds which are already in the prompt.
#   - tempo_feel, key_confidence (text/values, not stable enough to inject)
# analyze_two_songs.py:
#   - song_b_reference.source_file (only used to compute has_song_b_audio; not in prompt)
# emotion_to_prompt.py internal:
#   - style_template_used, emotion_hints_used (dead output fields; removed in v0.1.1)

Usage:
    python3 emotion_to_prompt.py \
        --emotion /tmp/song_a_emotion.json \
        --style /tmp/song_b_style.json \
        [--target-duration 180] \
        [--language english] \
        [--output /tmp/prompt.json]

Output: JSON with:
  - final_prompt: ready-to-use music generation prompt
  - prompt_sections: per-section instructions
  - arrangement_plan: instrument layering guide
"""
import sys
import os
import json
import argparse


# ─── Style-to-prompt templates ──────────────────────────────────

# MiniMax Music 2.6 “Production Sheet” formula:
#   [Genre/subgenre], [mood], [voice type], [instruments — name ALL],
#   [BPM] BPM in [key], [structure], [production/mix], [things to avoid]
#
# Critical: ALWAYS name all instruments, ALWAYS add “never drop to a cappella”,
# ALWAYS add “avoid sparse minimal arrangements”

# Per-style prompt defaults, keyed by style category name.
# Each value is a dict with these keys:
#   instruments — comma-separated instrument names (consumers split on ',')
#   mood        — free-text mood descriptors
#   vocal       — free-text description of the vocal delivery
#   era         — free-text era / atmosphere descriptor
#   bpm_range   — (low, high) tuple; resolve_target_bpm() clamps to this range
#   key         — a musical key string for the style (e.g. "E minor")
#   avoid       — "things to avoid" text for the production-sheet prompt
STYLE_TEMPLATES = {
    # category: {instruments, mood, vocal, era, bpm_range, key, avoid}
    "french_chanson": {
        "instruments": "accordion, upright bass, orchestral strings, piano",
        "mood": "melancholic romantic, dramatic, theatrical",
        "vocal": "passionate theatrical French vocalist, Edith Piaf style delivery",
        "era": "1960s Paris café",
        "bpm_range": (70, 90),
        "key": "E minor",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections, avoid electronic sounds",
    },
    "rock": {
        "instruments": "electric guitars, bass, drums, rhythm guitar",
        "mood": "energetic, powerful, driving",
        "vocal": "powerful rock vocalist, belting",
        "era": "classic rock",
        "bpm_range": (120, 150),
        "key": "A major",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "acoustic": {
        "instruments": "acoustic guitar, light percussion, upright bass",
        "mood": "intimate, warm, organic",
        "vocal": "soft gentle vocals, close-mic",
        "era": "modern folk",
        "bpm_range": (80, 110),
        "key": "G major",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "epic_orchestral": {
        "instruments": "full symphony strings, brass, timpani, choir",
        "mood": "cinematic, grand, sweeping",
        "vocal": "operatic or cinematic vocals",
        "era": "modern cinematic",
        "bpm_range": (60, 85),
        "key": "D minor",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "jazz": {
        "instruments": "piano, double bass, brushed drums, brass section",
        "mood": "smooth, sophisticated, swing",
        "vocal": "jazz vocalist, scat capable",
        "era": "1940s-1950s jazz club",
        "bpm_range": (140, 180),
        "key": "Bb major",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "latin": {
        "instruments": "acoustic guitar, percussion, brass, bass",
        "mood": "warm, passionate, rhythmic",
        "vocal": "Latin vocalist, emotional delivery",
        "era": "timeless Latin",
        "bpm_range": (90, 130),
        "key": "A minor",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "pop": {
        "instruments": "synths, drums, bass, polished production",
        "mood": "catchy, upbeat, modern",
        "vocal": "clear pop vocals, polished",
        "era": "modern pop",
        "bpm_range": (100, 130),
        "key": "C major",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "blues": {
        "instruments": "electric guitar, harmonica, bass, drums",
        "mood": "soulful, gritty, emotional",
        "vocal": "gritty blues vocals, expressive",
        "era": "classic blues",
        "bpm_range": (70, 100),
        "key": "E major",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "electronic": {
        "instruments": "synthesizers, drum machines, pads, arpeggiators",
        "mood": "atmospheric, pulsing, immersive",
        "vocal": "processed vocals, ethereal or robotic",
        "era": "modern electronic",
        "bpm_range": (110, 140),
        "key": "A minor",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
    "ballad": {
        "instruments": "piano, strings, gentle percussion, bass",
        "mood": "tender, emotional, intimate",
        "vocal": "soft emotive vocals, gradual build",
        "era": "timeless ballad",
        "bpm_range": (60, 90),
        "key": "C minor",
        "avoid": "avoid sparse minimal arrangements, avoid a cappella sections",
    },
}


# CLAP genre label (lowercase) -> style category name.
_GENRE_TO_CATEGORY = {
    'rock': 'rock', 'pop': 'pop', 'jazz': 'jazz', 'blues': 'blues',
    'electronic': 'electronic', 'hip hop': 'pop', 'rnb': 'pop', 'country': 'acoustic',
    'folk': 'acoustic', 'classical': 'epic_orchestral', 'metal': 'rock', 'punk': 'rock',
    'reggae': 'latin', 'latin': 'latin', 'soul': 'blues', 'funk': 'pop', 'disco': 'pop',
    'house': 'electronic', 'techno': 'electronic', 'synthwave': 'electronic',
    'ambient': 'ballad', 'indie': 'acoustic', 'alternative': 'rock', 'dream pop': 'pop',
    'post-rock': 'epic_orchestral', 'shoegaze': 'rock', 'lo-fi': 'acoustic',
}


def _coerce_bpm(value, default=100.0):
    """Return *value* as a float, or *default* when it is missing or non-numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _top_genre_label(genres):
    """Return the lowercase label of the first genre entry, or None.

    Accepts ``(label, score)`` pairs (tuples, or lists after a JSON round
    trip) as well as bare label strings; any other shape yields None so the
    caller can fall back to the heuristic instead of crashing.
    """
    if not isinstance(genres, (list, tuple)) or not genres:
        return None
    top = genres[0]
    if isinstance(top, (list, tuple)):
        top = top[0] if top else None
    return top.strip().lower() if isinstance(top, str) else None


def infer_style_category(style_data):
    """Infer the style category from audio analysis features.

    Priority: CLAP zero-shot classification > heuristic BPM/energy mapping.

    Args:
        style_data: dict of style-analysis output. Keys read here:
            ``clap_classification`` (``detected``, ``top_genres``), ``bpm``,
            ``brightness``, ``energy_description`` and ``instrument_hints``.
            Missing or null values fall back to neutral defaults.

    Returns:
        A style category name (str); every value returned is one of the
        names used as keys of the style template table.
    """
    # Priority 1: CLAP zero-shot classification (most reliable when present).
    # `or {}` also covers an explicit JSON null for the sub-object.
    clap = style_data.get('clap_classification') or {}
    if clap.get('detected'):
        category = _GENRE_TO_CATEGORY.get(_top_genre_label(clap.get('top_genres')))
        if category:
            return category

    # Priority 2: heuristic fallback (instrument hints, then BPM + energy).
    bpm = _coerce_bpm(style_data.get('bpm'))
    hints = style_data.get('instrument_hints') or {}

    if hints.get('likely_electronic'):
        return "electronic"
    if hints.get('likely_orchestral'):
        return "epic_orchestral"
    if hints.get('likely_acoustic'):
        return "ballad" if bpm < 95 else "acoustic"

    brightness_text = str(style_data.get('brightness', 'balanced')).lower()
    energy_text = str(style_data.get('energy_description', 'moderate')).lower()

    if bpm > 150:
        return "jazz" if 'warm' in brightness_text else "rock"
    if bpm > 120:
        if 'energetic' in energy_text or 'high' in energy_text:
            return "rock"
        return "pop"
    if bpm > 100:
        return "latin"
    if bpm > 85:
        return "pop"
    # Everything at or below 85 BPM is treated as a ballad.
    return "ballad"


def resolve_target_bpm(song_a_bpm, song_b_bpm, style_category):
    """Pick the tempo the generated track should use.

    The style's ``bpm_range`` (default ``(80, 120)`` when the category has no
    template or no range) is the reference window:

    * If Song A's tempo already lies inside the window it is returned as-is,
      which keeps the result close to the original for recognition.
    * Otherwise Song B's tempo is clamped into the window and rounded to one
      decimal place.
    """
    style_entry = STYLE_TEMPLATES.get(style_category, {})
    low, high = style_entry.get('bpm_range', (80, 120))

    # Song A already sits in the style's natural range — leave it untouched.
    if low <= song_a_bpm <= high:
        return song_a_bpm

    # Fall back to Song B's tempo, pulled inside the window.
    capped = min(high, song_b_bpm)
    clamped = max(low, capped)
    return round(clamped, 1)


# ─── Section-by-section prompt building ─────────────────────────

def build_section_prompts(emotion_sections, style_template, style_category=None):
    """Turn analysed emotion sections into per-section arrangement instructions.

    NOTE: a 'sparse' section means FEWER instruments, never NO instruments —
    every section keeps at least the first two instruments of the style.

    Args:
        emotion_sections: iterable of section dicts. Keys read:
            ``structural_label``, ``vocal_effort``, ``avg_intensity``,
            ``emotion_classification``, ``pitch_trend``, ``rhythm_feel``,
            ``vocal_register``, ``start_seconds``, ``end_seconds``.
        style_template: dict whose ``instruments`` value is a comma-separated
            instrument list.
        style_category: accepted for interface compatibility; not used here.

    Returns:
        A list with one dict per section: ``section_label``, ``start_seconds``,
        ``end_seconds``, ``arrangement_density``, ``dynamics``, ``instruction``
        and ``minimum_instruments``.
    """
    # (value, text) tables; the first matching row wins.
    trend_notes = (
        ('rising', ", building tension"),
        ('falling', ", releasing tension"),
    )
    emotion_notes = (
        ('desperate', ", raw emotional delivery"),
        ('passionate', ", heartfelt passionate vocals"),
        ('calm', ", gentle and restrained"),
    )
    rhythm_notes = (
        ('swing', ", swing feel"),
        ('straight', ", straight rhythm"),
    )
    register_notes = (
        ('falsetto', ", airy falsetto delivery"),
        ('head_voice', ", light head voice"),
        ('chest', ", full chest voice power"),
    )

    def note_for(value, table):
        """Return the text paired with *value* in *table*, or '' if absent."""
        return next((text for key, text in table if value == key), "")

    results = []
    last_label = None
    last_density = None

    for section in emotion_sections:
        label = section.get('structural_label', 'section')
        effort = section.get('vocal_effort', 'low')
        intensity = section.get('avg_intensity', 0)
        emotions = section.get('emotion_classification', [])
        trend = section.get('pitch_trend', 'steady')

        # How dense the arrangement should be for this section.
        if effort == 'high' or intensity > 0.1:
            density, dynamics = "full", "loud, powerful"
        elif effort == 'medium' or intensity > 0.04:
            density = "building" if trend == 'rising' else "moderate"
            dynamics = "medium intensity"
        else:
            density, dynamics = "sparse", "quiet, intimate"

        # Instrumentation is always spelled out, whatever the density.
        roster = style_template.get('instruments', 'balanced arrangement')
        names = [name.strip() for name in roster.split(',')]
        core = ', '.join(names[:2]) if len(names) >= 2 else roster

        if density == "full":
            arrangement = f"full arrangement with {roster}"
        elif density == "sparse":
            # Reduced, but still played — not silence, not a cappella.
            arrangement = f"reduced arrangement — {core} only, still fully played"
        else:
            arrangement = f"moderate arrangement — {', '.join(names[:3])} active"

        pieces = [f"{label.upper()}: {dynamics} — {arrangement}"]
        pieces.append(note_for(trend, trend_notes))
        pieces.append(next((text for key, text in emotion_notes if key in emotions), ""))
        pieces.append(note_for(section.get('rhythm_feel', ''), rhythm_notes))
        pieces.append(note_for(section.get('vocal_register', ''), register_notes))

        # Structure tags at a label change, driven by the density jump.
        if last_label and last_label != label:
            if last_density in ('full', 'building') and density == 'sparse':
                pieces.append(", insert [Break] tag here for dramatic 1-2 second pause with reverb tail")
            elif density == 'full' and last_density in ('sparse', 'moderate'):
                pieces.append(", preceded by [Build Up] tag for anticipation")

        results.append({
            "section_label": label,
            "start_seconds": section.get('start_seconds', 0),
            "end_seconds": section.get('end_seconds', 0),
            "arrangement_density": density,
            "dynamics": dynamics,
            "instruction": ''.join(pieces),
            "minimum_instruments": core,
        })

        last_label = label
        last_density = density

    return results


def build_arrangement_plan(emotion_data, style_category):
    """Describe which instruments enter in which part of the song.

    The instrument order comes from the style template's comma-separated
    ``instruments`` string. The intro and outro text depend on the overall
    intensity curve (``emotion_profile.intensity_curve.pattern``):
    ``crescendo`` starts very reduced and ends at full power, ``decrescendo``
    starts full and fades, anything else uses the default wording.

    Returns:
        dict mapping section names (``intro`` … ``outro``) plus ``_note`` to
        instruction strings.
    """
    curve = (
        emotion_data.get('emotion_profile', {})
        .get('intensity_curve', {})
        .get('pattern', 'unknown')
    )
    instruments = STYLE_TEMPLATES.get(style_category, {}).get('instruments', 'balanced arrangement')
    names = [part.strip() for part in instruments.split(',')]

    def nth(index, fallback):
        """Instrument at *index*, or *fallback* when the list is too short."""
        return names[index] if len(names) > index else fallback

    # str.split always yields at least one element, so names[0] is safe.
    lead = names[0]

    if curve == 'crescendo':
        intro = f"very reduced: single {lead} + subtle {nth(1, 'bass')} — quiet but instruments ARE playing"
        outro = "powerful sustained ending at peak intensity — all instruments at full volume"
    elif curve == 'decrescendo':
        intro = "full arrangement from the start — all instruments active"
        outro = f"fade to {lead} — gentle ending, instruments still audible"
    else:
        intro = f"reduced: {lead} + {nth(1, 'light bass')} only — instruments are present but minimal, NOT silent"
        outro = "gradual fade with instruments still playing, or dramatic final chord sustained"

    return {
        "intro": intro,
        "verse_1": f"add: {nth(2, 'light percussion')} alongside the intro instruments",
        "pre_chorus": f"add: {nth(3, 'strings')} — building toward full sound, insert [Build Up] tag",
        "chorus": f"FULL arrangement: all instruments active — {instruments}",
        "verse_2": "pull back slightly from chorus density, but keep at least 3 instruments active",
        "bridge": f"contrast: reduce to {lead} and bass — but still played, not silent. Insert [Break] tag before bridge.",
        "final_chorus": "maximum intensity: all instruments + backing harmonies + doubled parts",
        "outro": outro,
        "_note": "CRITICAL: 'sparse' or 'reduced' means FEWER instruments playing, NOT silence or a cappella. Every section has at least 2 instruments active. Use [Break] tags for dramatic pauses, [Build Up] before choruses.",
    }


# ─── Final prompt builder ───────────────────────────────────────

def build_vocal_speed_prompts(vocal_speed, pitch_bends):
    """Convert vocal speed patterns into specific music generation instructions.

    MiniMax doesn't have a direct 'vocal speed' parameter, but elongation can be
    achieved through:
    1. Fewer syllables per line in lyrics (gives model space to stretch)
    2. Repeated vowels in lyrics text ("yoooou", "I caaan't")
    3. Prompt cues: "restrained", "raw and emotional", "rubato"
    4. Section tags with parenthetical vocal cues: [Chorus] (slow, stretching words)
    5. Energy cue 'intimate' or 'restrained' slows delivery

    Args:
        vocal_speed: dict with ``pattern``, ``sections`` and
            ``average_syllables_per_second`` (optionally ``detected``), or a
            falsy value when no speed analysis is available.
        pitch_bends: iterable of dicts carrying ``pitch_range_hz``; ``None``
            is treated as "no pitch bends".

    Returns:
        dict with ``detected``, ``prompt_additions``, ``lyrics_modifications``
        and ``section_cues``; when analysis data is present it also carries
        ``pattern`` and ``significant_pitch_bends``.
    """
    if not vocal_speed or not vocal_speed.get('detected', True):
        return {
            "detected": False,
            "prompt_additions": [],
            "lyrics_modifications": [],
            "section_cues": [],
        }

    pattern = vocal_speed.get('pattern', 'steady')
    avg_sps = vocal_speed.get('average_syllables_per_second', 4.0)

    prompt_additions = []
    lyrics_modifications = []
    section_cues = []

    # Pattern-specific prompt additions
    if pattern == 'decelerating':
        prompt_additions.append(
            "vocal delivery progressively slows with emotional elongation, "
            "final sections sung with stretched syllables and rubato timing"
        )
    elif pattern == 'late_elongation':
        prompt_additions.append(
            "final chorus features emotionally stretched syllables, "
            "vocalist holds and bends notes at phrase endings"
        )
    elif pattern == 'gradual_slowing':
        prompt_additions.append(
            "gradual rubato throughout, vocalist increasingly stretches phrases"
        )
    elif pattern == 'accelerating':
        prompt_additions.append(
            "urgent driving vocal delivery, words come faster as emotion builds"
        )

    # Per-section cues based on speed classification.
    # `or []` tolerates an explicit null in the analysis JSON.
    for sec in vocal_speed.get('sections') or []:
        label = sec.get('structural_label', 'section')
        speed_class = sec.get('speed_classification', 'normal')

        if speed_class == 'slowed':
            sps = sec.get('syllables_per_second')
            if sps is None:
                sps = avg_sps
            duration = sec.get('duration_seconds') or 0
            # Suggested line length: about two seconds of singing, never
            # fewer than four syllables.
            target_syllables = max(4, int(sps * 2))

            section_cues.append({
                "section": label,
                "lyrics_instruction": (
                    f"Use fewer syllables per line ({target_syllables} syllables for "
                    f"~{duration:.0f}s), allowing space for word stretching"
                ),
                "prompt_cue": f"{label}: slow, emotionally stretched delivery, hold last syllable",
                "lyrics_tip": (
                    "Repeat key vowels: 'yoooou', 'mooooore', 'I caaaan't' — "
                    "MiniMax responds to repeated characters by elongating"
                ),
            })

            # Suggest specific lyrics modifications
            lyrics_modifications.append({
                "section": label,
                "instruction": (
                    f"Reduce to ~{target_syllables} syllables per line, "
                    "use repeated vowels for elongation effect"
                ),
                "example": "I caaan't goooo on with yoooou",
            })

        elif speed_class == 'accelerated':
            section_cues.append({
                "section": label,
                "prompt_cue": f"{label}: urgent, driving delivery, rapid-fire words",
            })

    # Pitch bend analysis → additional cues. Only bends wider than 40 Hz
    # count; a missing list or a null range is treated as no bend.
    significant_bends = [
        bend for bend in (pitch_bends or [])
        if (bend.get('pitch_range_hz') or 0) > 40
    ]
    if significant_bends:
        prompt_additions.append(
            f"{len(significant_bends)} phrase-ending pitch bends detected — "
            "include vocal slides and melismas at phrase endings"
        )

    return {
        "detected": pattern != 'steady',
        "pattern": pattern,
        "prompt_additions": prompt_additions,
        "lyrics_modifications": lyrics_modifications,
        "section_cues": section_cues,
        "significant_pitch_bends": len(significant_bends),
    }


# ─── Structured lyrics template generator ───────────────────────

def generate_structured_lyrics_template(emotion_sections, vocal_speed_patterns,
                                         silence_gaps, arrangement_plan,
                                         style_category, language="english"):
    """Generate a lyrics skeleton with MiniMax structure tags and vocal delivery hints.

    Produces a template with:
    - [Verse], [Chorus], [Bridge], etc. at section boundaries
    - [Break] tags where natural silence gaps exist in the original
    - [Build Up] tags before chorus sections
    - Vocal delivery hints per section (from emotion analysis)
    - Elongation markers for slowed sections
    - Placeholders {like_this} for LLM to fill with actual lyrics

    Args:
        emotion_sections: list of section dicts (``structural_label``,
            ``vocal_effort``, ``pitch_trend``, ``start_seconds``).
        vocal_speed_patterns: dict with a ``sections`` list whose entries
            carry ``structural_label``, ``start_seconds`` and
            ``speed_classification``; may be falsy.
        silence_gaps: list of dicts with ``start_seconds`` and
            ``duration_seconds``; may be None.
        arrangement_plan, style_category, language: accepted for interface
            compatibility; not used by this function.

    Returns:
        dict with ``template`` and ``note``; when sections are present also
        ``sections_count``, ``silence_gaps_used`` (number of gaps lasting at
        least 0.8 s) and ``vocal_delivery_hints``.
    """
    if not emotion_sections:
        return {
            "template": "[Intro]\n(Instrumental)\n\n[Verse]\n{verse_lines}\n\n[Chorus]\n{chorus_lines}",
            "note": "No emotion sections detected — using generic template",
        }

    # structural_label -> MiniMax structure tag. The generic 'section' label is
    # handled separately because its tag depends on the position in the song.
    tag_map = {
        'intro': '[Intro]',
        'verse': '[Verse]',
        'pre-chorus': '[Pre Chorus]',
        'pre_chorus': '[Pre Chorus]',
        'chorus': '[Chorus]',
        'bridge': '[Bridge]',
        'outro': '[Outro]',
        'interlude': '[Interlude]',
        'post-chorus': '[Post Chorus]',
        'post_chorus': '[Post Chorus]',
        'solo': '[Solo]',
    }

    vocal_sections = (vocal_speed_patterns.get('sections') or []) if vocal_speed_patterns else []
    # Only pauses of at least 0.8 s are treated as structural breaks.
    long_gaps = [g for g in (silence_gaps or []) if g.get('duration_seconds', 0) >= 0.8]

    def speed_section_for(label, start):
        """Find the vocal-speed entry that describes the section at *start*.

        A time-aligned entry (start within 3 s) wins; the label is only a
        fallback. Matching on the label first would hand every repeated
        section (e.g. the final chorus) the data of the first section that
        shares its label.
        """
        for entry in vocal_sections:
            if abs(entry.get('start_seconds', 0) - start) < 3.0:
                return entry
        for entry in vocal_sections:
            if entry.get('structural_label', '') == label:
                return entry
        return None

    template_lines = []
    for i, sec in enumerate(emotion_sections):
        label = sec.get('structural_label', 'section')
        effort = sec.get('vocal_effort', 'low')
        trend = sec.get('pitch_trend', 'steady')
        start = sec.get('start_seconds', 0)

        if label == 'section':
            tag = '[Verse]' if i > 0 else '[Intro]'
        else:
            tag = tag_map.get(label, '[Verse]')

        # Transition markers only make sense once something precedes them.
        if i > 0:
            # [Break] where a silence gap starts within 2 s of this section.
            if any(abs(gap.get('start_seconds', 0) - start) < 2.0 for gap in long_gaps):
                template_lines += [
                    '',
                    '[Break]',
                    '(1-2 second dramatic pause — reverb tail, not dead silence)',
                    '',
                ]
            # [Build Up] ahead of every chorus.
            if tag == '[Chorus]':
                template_lines += ['', '[Build Up]', '(Tension building)', '']

        template_lines += ['', tag]

        # Vocal delivery hint, emitted as a parenthetical line under the tag.
        if effort == 'high':
            delivery_hints = ['powerful, passionate delivery']
        elif effort == 'medium':
            delivery_hints = ['moderate intensity, building']
        else:
            delivery_hints = ['gentle, intimate delivery']

        if trend == 'rising':
            delivery_hints.append('vocals rising in pitch')
        elif trend == 'falling':
            delivery_hints.append('vocals releasing, settling')

        speed_entry = speed_section_for(label, start)
        if speed_entry is not None:
            speed_class = speed_entry.get('speed_classification', 'normal')
            if speed_class == 'slowed':
                delivery_hints.append('SLOW — stretch syllables, hold last word: "yoooou", "mooooore"')
            elif speed_class == 'accelerated':
                delivery_hints.append('urgent, rapid-fire delivery')

        template_lines.append('(' + ', '.join(delivery_hints) + ')')

        # Placeholder for the lyrics themselves.
        if label in ('intro', 'outro'):
            template_lines.append('{instrumental_or_adlibs}')
        else:
            template_lines.append('{lyrics_here}')

    return {
        "template": '\n'.join(template_lines),
        "sections_count": len(emotion_sections),
        "silence_gaps_used": len(long_gaps),
        "vocal_delivery_hints": True,
        "note": ("Fill {lyrics_here} placeholders with actual lyrics. "
                  "Keep vocal delivery hints in parentheses for LLM reference — "
                  "they will be stripped before sending to MiniMax API. "
                  "Structure tags ([Verse], [Chorus], [Break], [Build Up]) "
                  "MUST be preserved in final lyrics sent to MiniMax."),
    }


# ─── Cover workflow recommendation ──────────────────────────────

def recommend_workflow(emotion_data, style_data, has_song_a_audio=False,
                       has_song_b_audio=False):
    """Choose the MiniMax workflow that fits the inputs that are available.

    The decision depends on two facts only: whether Song A's audio is
    available (``has_song_a_audio``) and whether ``style_data`` carries lyrics
    (``lyrics``, ``formatted_lyrics``, or a ``detected_lyrics`` dict with
    ``tagged_lyrics`` / ``raw_transcript``).

    =========  ========  ==========================  =============
    audio A    lyrics    workflow                    model
    =========  ========  ==========================  =============
    yes        yes       cover_two_step              music-cover
    yes        no        cover_one_step              music-cover
    no         yes       standard                    music-2.6
    no         no        standard_with_lyrics_gen    music-2.6
    =========  ========  ==========================  =============

    The cover workflows reuse the original audio's melodic features, so they
    are preferred whenever audio is available; without them the melody is
    newly generated.

    ``emotion_data`` and ``has_song_b_audio`` are accepted but do not
    influence the result.

    Returns:
        dict with ``workflow``, ``model``, ``reasoning``, ``steps`` and
        ``note`` (the two-step cover result also has
        ``cover_feature_id_valid``).
    """
    lyrics_present = bool(style_data.get('lyrics') or style_data.get('formatted_lyrics'))
    if not lyrics_present:
        # The orchestrator reports lyrics under detected_lyrics instead.
        detected = style_data.get('detected_lyrics')
        if isinstance(detected, dict):
            lyrics_present = bool(detected.get('tagged_lyrics') or
                                  detected.get('raw_transcript'))

    if has_song_a_audio:
        if lyrics_present:
            # Audio + lyrics: two-step cover, best melody preservation.
            return {
                "workflow": "cover_two_step",
                "model": "music-cover",
                "reasoning": (
                    "Original audio available AND lyrics available → Two-step cover workflow. "
                    "music_cover_preprocess extracts melody features, then music_generation "
                    "with cover_feature_id preserves the melody while applying new style. "
                    "This is the BEST option for melody recognition."
                ),
                "steps": [
                    "1. Call POST /v1/music_cover_preprocess with audio_url of Song A",
                    "2. Receive cover_feature_id + formatted_lyrics (editable)",
                    "3. Edit formatted_lyrics if needed (translate, add structure tags, add [Break]/[Build Up])",
                    "4. Call POST /v1/music_generation with model='music-cover', cover_feature_id, edited lyrics, style prompt",
                    "5. Download result URL within 24 hours",
                ],
                "cover_feature_id_valid": "24 hours",
                "note": "cover_feature_id is free to generate. Lyrics MUST be 10-1000 chars with structure tags.",
            }

        # Audio only: one-step cover, lyrics come from ASR.
        return {
            "workflow": "cover_one_step",
            "model": "music-cover",
            "reasoning": (
                "Original audio available but no custom lyrics → One-step cover. "
                "MiniMax will extract lyrics from audio via ASR and transform style. "
                "Less control over lyrics but preserves melody automatically."
            ),
            "steps": [
                "1. Call POST /v1/music_generation with model='music-cover', audio_url, prompt",
                "2. MiniMax extracts lyrics from audio (ASR) and transforms style",
                "3. Download result URL within 24 hours",
            ],
            "note": "ASR-extracted lyrics may be inaccurate. For better control, use two-step workflow.",
        }

    if lyrics_present:
        # Lyrics only: standard generation with a newly composed melody.
        return {
            "workflow": "standard",
            "model": "music-2.6",
            "reasoning": (
                "No original audio available but have lyrics → Standard generation. "
                "MiniMax will create a new melody. Song recognition depends on "
                "lyrics accuracy and prompt quality. Use production sheet formula "
                "for best results."
            ),
            "steps": [
                "1. Prepare structured lyrics with [Verse], [Chorus], [Break], [Build Up] tags",
                "2. Build production sheet prompt (genre, mood, voice, instruments, BPM, key, structure, production, avoid)",
                "3. Call POST /v1/music_generation with model='music-2.6', prompt, lyrics",
                "4. Download result URL within 24 hours",
            ],
            "note": "Without original audio, melody will be AI-generated. Recognition depends on lyrics and prompt quality.",
        }

    # Neither audio nor lyrics: everything is generated.
    return {
        "workflow": "standard_with_lyrics_gen",
        "model": "music-2.6",
        "reasoning": (
            "No original audio AND no lyrics → Use lyrics_optimizer to auto-generate lyrics "
            "from prompt, then generate music. Least control but fastest."
        ),
        "steps": [
            "1. (Optional) Call POST /v1/lyrics_generation with mode='write_full_song' to get structured lyrics",
            "2. Build production sheet prompt",
            "3. Call POST /v1/music_generation with model='music-2.6', prompt, lyrics_optimizer=true",
            "4. Download result URL within 24 hours",
        ],
        "note": "Consider using lyrics_generation first for better structure control, then pass those lyrics to music-2.6.",
    }


def build_final_prompt(emotion_data, style_data, style_category, target_bpm,
                       duration_seconds=180, language="english", silence_gaps=None,
                       compatibility_notes=None, image_data=None, video_data=None,
                       mashup_plan=None):
    """Build the complete music generation prompt.

    The prompt follows the "production sheet" formula: genre/era, mood, voice,
    instruments, BPM + key, structure, production/mix, things to avoid. Each
    clause is collected in a list and the list is joined with ``",\\n"``.

    Args:
        emotion_data: Dict of vocal-emotion analysis. Keys read here:
            ``emotion_profile``, ``music_generation_hints``, ``chord_progression``,
            ``loudness_profile``, ``harmonic_percussive``, ``vocal_quality``,
            ``emotion_sections``, ``vocal_speed_patterns``, ``pitch_bends``.
        style_data: Dict of style analysis. Keys read here: ``estimated_key``,
            ``beat_tracking``, ``melody_analysis``, ``ast_classification``,
            ``clap_classification``, ``brightness``, ``energy_description``,
            ``tempo_consistency``, ``onset_density``, ``instrument_hints``.
        style_category: Key into ``STYLE_TEMPLATES``; unknown categories fall
            back to generic defaults for every template field.
        target_bpm: Numeric tempo for the prompt. Replaced by the beat-tracking
            BPM when that estimate has confidence > 0.8 and lies in (60, 200).
        duration_seconds: Accepted for interface compatibility; it is not
            currently rendered into the prompt text.
        language: Lyrics language name or code; anything other than
            ``english``/``en`` adds a "<language> lyrics" clause.
        silence_gaps: Optional list of dicts with ``start_seconds`` and
            ``duration_seconds``; gaps of at least 0.8s are surfaced (max 4).
        compatibility_notes: Optional list of strings; the first three are
            appended as a "mashup compatibility" clause.
        image_data: Optional dict of image-derived cues; ignored if it has a
            truthy ``error`` entry.
        video_data: Optional dict of video-derived cues; ignored if it has a
            truthy ``error`` entry.
        mashup_plan: Optional dict; ``style_notes`` and
            ``instrument_prompt_additions`` are appended when present.

    Returns:
        The final prompt as a single string.
    """
    # Function-scope import (the original imported Counter locally as well);
    # done once instead of once per aggregation step.
    from collections import Counter

    profile = emotion_data.get('emotion_profile', {})
    hints = emotion_data.get('music_generation_hints', [])
    template = STYLE_TEMPLATES.get(style_category, {})

    # Key defaults to the style analysis estimate; may be overridden below by
    # the MIDI-derived key when melody analysis is available.
    key = style_data.get('estimated_key', 'C major')

    # Template fields, each with a generic fallback.
    instruments = template.get('instruments', 'balanced arrangement')
    mood = template.get('mood', 'expressive')
    vocal = template.get('vocal', 'clear emotive vocals')
    era = template.get('era', 'modern')
    avoid = template.get('avoid', 'avoid sparse minimal arrangements, avoid a cappella sections')

    # Production sheet opening: the first four entries have fixed positions
    # that the CLAP step below relies on when it inserts by index.
    parts = [
        # 1. Genre + era
        f"{style_category.replace('_', ' ')} style, {era} atmosphere",
        # 2. Mood
        f"{mood} mood",
        # 3. Voice
        f"{vocal}",
        # 4. Instruments — always listed, always playing
        f"FULL ARRANGEMENT: {instruments} — all instruments always playing throughout, never drop to a cappella or silence",
    ]

    # Chord progression (autochord)
    chords = emotion_data.get('chord_progression', {})
    if chords.get('detected') and chords.get('progression_string'):
        parts.append(f"chord progression: {chords['progression_string']}")

    # Beat tracking (beat_this)
    beats = style_data.get('beat_tracking', {})
    if isinstance(beats, dict) and beats.get('bpm_estimated') and beats.get('bpm_confidence', 0) > 0.5:
        time_sig = beats.get('time_signature_estimate', 4)
        bpm = beats['bpm_estimated']
        conf = beats['bpm_confidence']
        # Only a high-confidence, plausible-range estimate replaces target_bpm.
        if conf > 0.8 and 60 < bpm < 200:
            target_bpm = bpm
        parts.append(f"beat grid: {time_sig}/4 at {bpm:.0f} BPM (confidence {conf:.2f})")

    # Basic Pitch melody analysis
    melody = style_data.get('melody_analysis', {})
    if isinstance(melody, dict) and melody.get('key_estimate_from_midi'):
        # The MIDI-derived key is treated as a stronger signal than the
        # style analysis key estimate and replaces it.
        key_from_midi = melody['key_estimate_from_midi']
        key = key_from_midi
        interval_pattern = melody.get('interval_pattern', '')
        scale_modes = melody.get('scale_modes', [])
        bits = [f"melodic key from MIDI: {key_from_midi}"]
        if interval_pattern and interval_pattern != 'unknown':
            bits.append(f"interval motion: {interval_pattern.replace('_', ' ')}")
        if scale_modes:
            bits.append(f"modal character: {', '.join(scale_modes[:2])}")
        parts.append("; ".join(bits))

    # BPM + key, emitted after the overrides above so they reflect the most
    # accurate signals available.
    parts.append(f"tempo {target_bpm:.0f} BPM in {key}")

    # AST classification (top instruments)
    ast = style_data.get('ast_classification', {})
    if isinstance(ast, dict) and ast.get('top_instruments'):
        top = ast['top_instruments'][:5]
        inst_str = ", ".join(
            f"{t['label'].lower()} ({t['score']:.2f})" for t in top if t['score'] > 0.1
        )
        if inst_str:
            parts.append(f"AST-detected sound palette: {inst_str}")

    # Loudness dynamics (LUFS / LRA)
    loudness = emotion_data.get('loudness_profile', {})
    if loudness.get('integrated_lufs') is not None:
        dynamics_class = loudness.get('dynamics_classification', '')
        if dynamics_class == 'wide_dynamic_range':
            parts.append("wide dynamic range with natural compression — quiet passages contrasted with loud climaxes")
        elif dynamics_class == 'compressed_consistent':
            parts.append("heavily compressed wall-of-sound production — consistent energy throughout")

    # Harmonic/percussive balance (HPSS)
    hpss = emotion_data.get('harmonic_percussive', {})
    hpss_class = hpss.get('classification', '')
    if hpss_class == 'smooth_melodic':
        parts.append("smooth melodic texture with minimal percussive harshness")
    elif hpss_class == 'percussive_rhythmic':
        parts.append("driving rhythmic texture with prominent percussive elements")

    # Global vocal quality (parselmouth)
    vocal_quality = emotion_data.get('vocal_quality', {})
    vq = vocal_quality.get('voice_quality', '')
    if vq == 'smooth_clean':
        parts.append("clean polished vocal production")
    elif vq == 'rough_pressed':
        parts.append("raw gritty vocal delivery with pressed quality")
    elif vq == 'slightly_rough':
        parts.append("slightly rough edgy vocal character")

    # CLAP classification: added alongside the template values, never
    # replacing them. Entries are indexed with [0], so each item is assumed
    # to be a (label, score)-like sequence.
    clap = style_data.get('clap_classification', {})
    if clap.get('detected'):
        top_moods = clap.get('top_moods', [])
        if top_moods:
            detected_mood_str = ", ".join([m[0] for m in top_moods[:3]])
            # Index 2 places this right after the template mood entry.
            parts.insert(2, f"detected mood from audio: {detected_mood_str}")
        top_instruments = clap.get('top_instruments', [])
        if top_instruments:
            detected_inst_str = ", ".join([i[0] for i in top_instruments[:5]])
            parts.insert(4, f"audio also features: {detected_inst_str}")

    # Most frequent emotion labels across all sections (top 3)
    all_emotion_sections = emotion_data.get('emotion_sections', [])
    detected_emotions = []
    for sec in all_emotion_sections:
        detected_emotions.extend(sec.get('emotion_classification', []))
    if detected_emotions:
        top_emotions = [e for e, _ in Counter(detected_emotions).most_common(3)]
        emotion_str = ", ".join(top_emotions)
        parts.append(f"emotion signature from analysis: {emotion_str}")

    # Per-section categorical labels; uninformative values are dropped before
    # picking the dominant one.
    section_registers = [
        s.get('vocal_register', 'unknown') for s in all_emotion_sections
        if s.get('vocal_register') and s.get('vocal_register') not in ('unknown', 'mixed')
    ]
    section_harmonies = [
        s.get('harmony_quality', 'unknown') for s in all_emotion_sections
        if s.get('harmony_quality') and s.get('harmony_quality') != 'unknown'
    ]
    section_rhythms = [
        s.get('rhythm_feel', 'unknown') for s in all_emotion_sections
        if s.get('rhythm_feel') and s.get('rhythm_feel') not in ('unknown', 'mixed')
    ]

    if section_registers:
        dominant_register = Counter(section_registers).most_common(1)[0][0]
        register_map = {
            'chest': 'full chest voice power throughout',
            'head': 'light head voice mix throughout',
            'falsetto': 'airy falsetto delivery throughout',
        }
        if dominant_register in register_map:
            parts.append(f"vocal character: {register_map[dominant_register]}")

    if section_harmonies:
        dominant_harmony = Counter(section_harmonies).most_common(1)[0][0]
        harmony_map = {
            'consonant': 'consonant, smooth harmonic content',
            'tense': 'tense, dissonant harmonic character',
            'rich': 'rich harmonic movement and color',
        }
        if dominant_harmony in harmony_map:
            parts.append(f"harmonic character: {harmony_map[dominant_harmony]}")

    if section_rhythms:
        dominant_rhythm = Counter(section_rhythms).most_common(1)[0][0]
        rhythm_map = {
            'swing': 'swing feel, laid-back groove',
            'straight': 'straight eighth-note rhythm',
        }
        if dominant_rhythm in rhythm_map:
            parts.append(f"rhythm: {rhythm_map[dominant_rhythm]}")

    # Tonal character (brightness + energy_description from the style data)
    brightness = style_data.get('brightness')
    energy_desc = style_data.get('energy_description')
    brightness_text = str(brightness).lower() if brightness is not None else ''
    if brightness_text and brightness_text != 'balanced':
        bright_map = {
            'bright': 'bright treble, crisp presence',
            'dark': 'dark warm tone, rolled-off highs',
        }
        # Substring match, so free-text descriptions are handled too.
        if any(token in brightness_text for token in ('bright', 'treble')):
            parts.append(f"tonal character: {bright_map['bright']}")
        elif any(token in brightness_text for token in ('dark', 'warm', 'mellow', 'bass')):
            parts.append(f"tonal character: {bright_map['dark']}")
    if energy_desc and energy_desc not in ('moderate', 'medium'):
        # The energy description is injected verbatim.
        parts.append(f"energy profile: {energy_desc}")

    # Rhythm metrics (tempo_consistency, onset_density from the style data)
    tempo_consistency = style_data.get('tempo_consistency')
    onset_density = style_data.get('onset_density')
    if tempo_consistency is not None:
        if tempo_consistency > 0.85:
            parts.append("rhythm: tight, on-beat delivery")
        elif tempo_consistency < 0.5:
            parts.append("rhythm: loose, with tempo drift")
    if onset_density is not None:
        if onset_density > 5:
            parts.append("high note density — busy, intricate")
        elif onset_density < 1.5:
            parts.append("low note density — spacious, with room")

    # Instrument hints (boolean-like flags in the style data)
    instrument_hints = style_data.get('instrument_hints', {})
    if instrument_hints:
        hint_map = {
            'likely_acoustic': 'acoustic-leaning timbres',
            'likely_electronic': 'electronic / synthetic textures',
            'likely_orchestral': 'orchestral / cinematic textures',
            'likely_distorted': 'distorted / overdriven tones',
        }
        matched = [hint_map[k] for k in hint_map if instrument_hints.get(k)]
        if matched:
            parts.append(f"instruments detected: {', '.join(matched)}")

    # Per-section vocal texture: flag sections that stand out from the rest.
    # Breathy = HNR more than 5 below the mean HNR of the sections that
    # report one.
    hnrs = [s['hnr_db'] for s in all_emotion_sections if s.get('hnr_db') is not None]
    if hnrs:
        avg_hnr = sum(hnrs) / len(hnrs)
        breathy_sections = [
            s for s in all_emotion_sections
            if s.get('hnr_db') is not None and s['hnr_db'] < avg_hnr - 5
        ]
        if breathy_sections:
            # A missing/None/empty label falls back to 'section' so the set
            # stays sortable and joinable.
            labels = sorted({s.get('structural_label') or 'section' for s in breathy_sections})[:3]
            parts.append(
                f"vocal texture in {'/'.join(labels)}: breathier / more intimate than average"
            )
    # Raw/strained = jitter above 1.0. A section may carry an explicit None
    # for jitter_pct; dict.get's default does not apply in that case, so the
    # None check must be explicit to avoid comparing None with a float.
    high_jitter_sections = [
        s for s in all_emotion_sections
        if s.get('jitter_pct') is not None and s['jitter_pct'] > 1.0
    ]
    if high_jitter_sections:
        labels = sorted({s.get('structural_label') or 'section' for s in high_jitter_sections})[:3]
        parts.append(
            f"vocal texture in {'/'.join(labels)}: raw / strained delivery"
        )

    # Emotion-specific dynamics
    curve = profile.get('intensity_curve', {}).get('pattern', '')
    dynamic_range = profile.get('dynamic_range', 0)

    # Vocal speed / elongation
    vocal_speed = emotion_data.get('vocal_speed_patterns', {})
    speed_pattern = vocal_speed.get('pattern', 'steady')

    if speed_pattern == 'decelerating':
        parts.append("vocal delivery progressively slows with emotional syllable stretching toward the end")
    elif speed_pattern == 'late_elongation':
        parts.append("final sections feature emotionally stretched syllables, vocalist holds and bends notes")
    elif speed_pattern == 'gradual_slowing':
        parts.append("gradual rubato throughout, vocalist increasingly stretches phrases")

    # Pitch bends: only mentioned when more than three exceed a 40 Hz range.
    pitch_bends = emotion_data.get('pitch_bends', [])
    significant_bends = [b for b in pitch_bends if b.get('pitch_range_hz', 0) > 40]
    if len(significant_bends) > 3:
        parts.append("expressive pitch bends and vocal slides at phrase endings")

    # Intensity curve -> arrangement build strategy.
    # "Reduced" sections still name instruments: the first two from the
    # template list (or the whole string if it has fewer than two entries).
    inst_list = [i.strip() for i in instruments.split(',')]
    min_inst = ', '.join(inst_list[:2]) if len(inst_list) >= 2 else instruments

    if curve == 'crescendo':
        parts.append(f"starts with reduced arrangement ({min_inst} only), progressively adds layers, builds to full orchestration at climax")
    elif curve == 'decrescendo':
        parts.append(f"opens with full arrangement, gradually reduces to {min_inst} for intimate ending")
    elif curve == 'wave':
        parts.append(f"dynamic contrast between sections — fuller for peaks, reduced to {min_inst} for valleys, but ALWAYS with instrumental backing")
    elif curve == 'climax_late':
        parts.append("restrained through early sections (fewer instruments but never silent), explosive climax in final third with full arrangement")

    if dynamic_range > 0.08:
        parts.append("wide dynamic range — quiet passages contrasted with loud climaxes, include 1-2 second dramatic pauses between major sections")

    # Detected silence gaps give concrete pause positions (first four only).
    if silence_gaps:
        significant_gaps = [g for g in silence_gaps if g.get('duration_seconds', 0) >= 0.8]
        if significant_gaps:
            gap_positions = [f"{g['start_seconds']:.0f}s ({g['duration_seconds']:.1f}s pause)" for g in significant_gaps[:4]]
            parts.append(f"natural dramatic pauses detected at: {', '.join(gap_positions)} — preserve these with [Break] tags in lyrics")

    # Repetitive intensification
    rep = profile.get('repetitive_intensification', {})
    if rep.get('detected'):
        parts.append("for repeated phrases: each repetition grows more intense and powerful")

    # Sudden emotional shifts: give timing for the first three.
    shifts = profile.get('emotional_shifts', [])
    sudden = [s for s in shifts if s.get('type') == 'sudden']
    if sudden:
        pause_positions = [f"{int(s['at_seconds'])}s" for s in sudden[:3]]
        parts.append(f"include dramatic 1-2 second pauses before sudden intensity shifts at approximately: {', '.join(pause_positions)}")
        parts.append("silence gaps between sections should have brief instrumental sustain or reverb tail — not abrupt silence")

    # Language: English is the implicit default and adds nothing.
    if language and language.lower() not in ('english', 'en'):
        lang_names = {
            'fr': 'French', 'french': 'French',
            'es': 'Spanish', 'spanish': 'Spanish',
            'de': 'German', 'german': 'German',
            'it': 'Italian', 'italian': 'Italian',
            'pt': 'Portuguese', 'portuguese': 'Portuguese',
        }
        # Unknown languages are passed through as given.
        lang = lang_names.get(language.lower(), language)
        parts.append(f"{lang} lyrics")

    # Quality
    parts.append("studio recording quality")

    # Image-driven style cues (album art / image input)
    if image_data and not image_data.get('error'):
        image_bits = []
        if image_data.get('mood') and image_data['mood'] != 'unknown':
            image_bits.append(f"visual mood: {image_data['mood']}")
        for hint in image_data.get('style_hints', [])[:3]:
            if hint:
                image_bits.append(f"art suggests: {hint}")
        for hint in image_data.get('production_hints', [])[:2]:
            if hint:
                image_bits.append(f"production cue: {hint}")
        for hint in image_data.get('era_hints', [])[:2]:
            if hint:
                image_bits.append(f"era cue: {hint}")
        if image_bits:
            parts.append("; ".join(image_bits))

    # Video-driven style cues (music video / video input)
    if video_data and not video_data.get('error'):
        video_bits = []
        cp = video_data.get('color_palette', {})
        if isinstance(cp, dict) and cp.get('dominant_mood'):
            video_bits.append(f"video palette mood: {cp['dominant_mood']}")
        if video_data.get('vocal_emotion') and not video_data['vocal_emotion'].get('error'):
            ve = video_data['vocal_emotion']
            if ve.get('emotion_profile', {}).get('overall_emotion_type'):
                video_bits.append(f"video vocal emotion: {ve['emotion_profile']['overall_emotion_type']}")
        top_genres = video_data.get('audio_features', {}).get('clap_classification', {}).get('top_genres')
        if top_genres:
            # Genre entries may be (label, score) sequences, dicts with a
            # 'label' key, or plain values; all are reduced to a label string.
            genre_labels = []
            for genre in top_genres[:2]:
                if isinstance(genre, (list, tuple)) and genre:
                    genre_labels.append(str(genre[0]))
                elif isinstance(genre, dict) and genre.get('label'):
                    genre_labels.append(str(genre['label']))
                else:
                    genre_labels.append(str(genre))
            if genre_labels:
                video_bits.append(f"video audio: {', '.join(genre_labels)}")
        if video_data.get('motion_arc'):
            avg_motion = sum(video_data['motion_arc']) / max(len(video_data['motion_arc']), 1)
            motion_descriptor = "high-motion" if avg_motion > 0.4 else "low-motion" if avg_motion < 0.1 else "moderate-motion"
            video_bits.append(f"video energy: {motion_descriptor}")
        if video_bits:
            parts.append("; ".join(video_bits))

    # Mashup-specific cues
    if mashup_plan:
        style_notes = mashup_plan.get('style_notes')
        if style_notes:
            parts.append(f"style direction: {style_notes}")
        inst_additions = mashup_plan.get('instrument_prompt_additions')
        if inst_additions:
            parts.append(inst_additions)

    # Precomputed music generation hints: injected as one trailing clause so
    # they reach the model. A hint is skipped when its first 40 characters
    # (lower-cased) already appear in the prompt built so far; at most four
    # hints are kept.
    if hints:
        existing = " ".join(parts).lower()
        unique_hints = [h for h in hints if h and h.lower()[:40] not in existing]
        if unique_hints:
            parts.append("; ".join(unique_hints[:4]))

    # Avoid list — from template
    parts.append(avoid)

    # Mashup compatibility notes (first three)
    if compatibility_notes:
        notes_text = "; ".join(compatibility_notes[:3])
        parts.append(f"mashup compatibility: {notes_text}")

    return ",\n".join(parts)


# ─── Main ───────────────────────────────────────────────────────

def main():
    """CLI entry point: load analysis JSON, build the prompt bundle, emit JSON.

    Reads the emotion and style JSON files (and optionally the orchestrator
    output), normalises them, calls the prompt/arrangement/lyrics/workflow
    builders defined in this module, and writes the combined result either to
    ``--output`` or to stdout.

    All files are read and written as UTF-8: the result is serialised with
    ``ensure_ascii=False``, so relying on the locale's default encoding could
    fail or corrupt non-ASCII lyrics/prompt text.
    """
    parser = argparse.ArgumentParser(description='Convert emotion + style analysis to music generation prompt')
    parser.add_argument('--emotion', required=True, help='Emotion analysis JSON (from analyze_vocal_emotion.py)')
    parser.add_argument('--style', required=True, help='Style analysis JSON (from analyze_audio.py or analyze_two_songs.py)')
    parser.add_argument('--orchestrator-output', help='Full orchestrator JSON (analysis_orchestrator.py output) — '
                        'enables image/video/mashup emotion consumption')
    parser.add_argument('--style-category', choices=list(STYLE_TEMPLATES.keys()),
                        help='Force style category (auto-detected if omitted)')
    parser.add_argument('--target-duration', type=int, default=180, help='Target duration in seconds (default 180)')
    parser.add_argument('--language', default='english', help='Lyrics language')
    parser.add_argument('--song-a-bpm', type=float, help='Override Song A BPM')
    parser.add_argument('--output', help='Output JSON file path')
    args = parser.parse_args()

    def load_json(path):
        """Read one JSON document from *path*, decoding it as UTF-8."""
        with open(path, encoding='utf-8') as f:
            return json.load(f)

    # Load inputs
    emotion_data = load_json(args.emotion)
    # Style might be single-song format or two-song (mashup) format.
    raw_style = load_json(args.style)
    # Optional: full orchestrator output for image/video/mashup emotion
    orchestrator_data = load_json(args.orchestrator_output) if args.orchestrator_output else None

    # Normalise: extract Song B style from either format. The two-song format
    # is recognised by a 'song_b_reference' or 'mashup_plan' key; when only
    # 'mashup_plan' is present the whole document doubles as the style data.
    if 'song_b_reference' in raw_style or 'mashup_plan' in raw_style:
        style_data = raw_style.get('song_b_reference', raw_style)
        # `or {}` also covers an explicit JSON null.
        song_a_data = raw_style.get('song_a_original') or {}
        mashup_plan = raw_style.get('mashup_plan') or {}
    else:
        # Single song analysis
        style_data = raw_style
        song_a_data = {}
        mashup_plan = {}

    # If orchestrator output is available, prefer its mashup vocal emotion
    # over the plain emotion file.
    if orchestrator_data and orchestrator_data.get('mashup', {}).get('song_a_vocal_emotion'):
        emotion_data = orchestrator_data['mashup']['song_a_vocal_emotion']

    # Pull detected lyrics and advanced analysis from the orchestrator output
    # and inject them into a copy of style_data so the prompt builder and the
    # workflow recommendation see them.
    detected_lyrics = None
    if orchestrator_data:
        audio_part = orchestrator_data.get('audio') or {}
        mashup_part = orchestrator_data.get('mashup') or {}
        style_data = dict(style_data)  # shallow copy: never mutate the loaded document

        # 1) Detected lyrics (also reported verbatim in the result below)
        detected_lyrics = audio_part.get('lyrics') or mashup_part.get('song_a_lyrics')
        if detected_lyrics and isinstance(detected_lyrics, dict):
            style_data['detected_lyrics'] = detected_lyrics

        # 2) Beat tracking, 3) melody analysis, 4) AST classification:
        # each is copied across unless it is not a dict or reports an error.
        for source_key, target_key in (
            ('beats', 'beat_tracking'),
            ('melody', 'melody_analysis'),
            ('ast_classification', 'ast_classification'),
        ):
            analysis = audio_part.get(source_key) or {}
            if isinstance(analysis, dict) and not analysis.get('error'):
                style_data[target_key] = analysis

    # Determine style category
    if args.style_category:
        category = args.style_category
    else:
        category = infer_style_category(style_data)

    # Target BPM: an explicit mashup target wins; otherwise resolve from the
    # two songs' tempi.
    song_a_bpm = args.song_a_bpm or song_a_data.get('bpm', style_data.get('bpm', 100))
    song_b_bpm = style_data.get('bpm', 100)
    target_bpm = mashup_plan.get('target_bpm') or resolve_target_bpm(song_a_bpm, song_b_bpm, category)

    # Compatibility notes from the mashup plan (if available)
    compat = mashup_plan.get('compatibility') or {}
    compatibility_notes = compat.get('notes') or []

    # Build outputs
    template = STYLE_TEMPLATES.get(category, {})

    # If a detected song structure is available, upgrade emotion section
    # labels: each emotion section takes the label of the first structure
    # segment that contains its midpoint. Sections are updated in place.
    song_structure = emotion_data.get('song_structure', {})
    if song_structure.get('detected') and song_structure.get('segments'):
        struct_segments = song_structure['segments']
        for esec in emotion_data.get('emotion_sections', []):
            es_mid = (esec.get('start_seconds', 0) + esec.get('end_seconds', 0)) / 2
            for sseg in struct_segments:
                if sseg.get('start_seconds', 0) <= es_mid <= sseg.get('end_seconds', 0):
                    allin1_label = sseg.get('label', '')
                    if allin1_label:
                        esec['structural_label'] = allin1_label
                    # Stop at the first containing segment even if unlabeled.
                    break

    section_prompts = build_section_prompts(emotion_data.get('emotion_sections', []), template)
    arrangement_plan = build_arrangement_plan(emotion_data, category)

    # Vocal speed patterns -> prompts and lyrics modifications
    vocal_speed_data = emotion_data.get('vocal_speed_patterns', {})
    pitch_bends_data = emotion_data.get('pitch_bends', [])
    vocal_speed_prompts = build_vocal_speed_prompts(vocal_speed_data, pitch_bends_data)

    final_prompt = build_final_prompt(
        emotion_data, style_data, category, target_bpm,
        duration_seconds=args.target_duration, language=args.language,
        silence_gaps=emotion_data.get('silence_gaps', []),
        compatibility_notes=compatibility_notes,
        image_data=(orchestrator_data or {}).get('image'),
        video_data=(orchestrator_data or {}).get('video'),
        mashup_plan=mashup_plan or None,
    )

    # Generate structured lyrics template with [Break] and [Build Up] tags
    structured_lyrics = generate_structured_lyrics_template(
        emotion_sections=emotion_data.get('emotion_sections', []),
        vocal_speed_patterns=vocal_speed_data,
        silence_gaps=emotion_data.get('silence_gaps', []),
        arrangement_plan=arrangement_plan,
        style_category=category,
        language=args.language,
    )

    # Recommend a generation workflow (cover vs standard generation).
    has_audio = bool(emotion_data.get('audio_info', {}).get('file'))
    # Song B counts as having audio only when its source_file names a file
    # that actually exists on disk.
    source_file = style_data.get('source_file')
    has_song_b_audio = bool(source_file) and os.path.isfile(source_file)
    workflow_rec = recommend_workflow(
        emotion_data=emotion_data,
        style_data=style_data,
        has_song_a_audio=has_audio,
        has_song_b_audio=has_song_b_audio,
    )

    result = {
        "style_category": category,
        "target_bpm": target_bpm,
        "target_duration_seconds": args.target_duration,
        "language": args.language,
        "final_prompt": final_prompt,
        "structured_lyrics_template": structured_lyrics,
        "workflow_recommendation": workflow_rec,
        "section_prompts": section_prompts,
        "arrangement_plan": arrangement_plan,
        "vocal_speed_patterns": vocal_speed_prompts,
        # style_template_used removed in v0.1.1 (always == style_category; no downstream consumer)
        # emotion_hints_used removed in v0.1.1 (hints are now injected into final_prompt directly)
        "detected_lyrics": detected_lyrics,
    }

    json_out = json.dumps(result, indent=2, ensure_ascii=False)

    if args.output:
        # Explicit UTF-8: json_out may contain raw non-ASCII characters.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(json_out)
        print(f"Written: {args.output}", file=sys.stderr)
    else:
        print(json_out)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()