#!/usr/bin/env python3
"""analysis_orchestrator.py — Single entry point for analyzing any input.

Takes any combination of inputs and runs the right analysis scripts:
  - Audio file → analyze_vocal_emotion.py + analyze_audio.py + (optional) extract_lyrics_whisper.py
  - Two audio files → analyze_two_songs.py + vocal emotion on Song A
  - Video file → extract_video_features.py + vocal emotion on the extracted audio
  - Image file → analyze_image.py
  - YouTube URL → download with the standalone download_youtube.py, then treat as audio

Output: a unified JSON containing all the analyses, ready for emotion_to_prompt.py
or direct use by the LLM.

Usage:
    # Single audio (with lyrics)
    python3 analysis_orchestrator.py --audio /tmp/song.wav --lyrics --output /tmp/analysis.json

    # Two audios (mashup)
    python3 analysis_orchestrator.py --audio /tmp/a.wav --audio /tmp/b.wav \\
        --name-a "Song A" --name-b "Song B" --output /tmp/analysis.json

    # Video
    python3 analysis_orchestrator.py --video /tmp/clip.mp4 --output /tmp/analysis.json

    # Image
    python3 analysis_orchestrator.py --image /tmp/album_art.jpg --output /tmp/analysis.json

    # YouTube URL (downloads first, --lyrics optional)
    python3 analysis_orchestrator.py --youtube "https://youtube.com/watch?v=..." --lyrics --output /tmp/analysis.json

    # Combination: audio + image (Song A audio, plus album art for style cues)
    python3 analysis_orchestrator.py --audio /tmp/song.wav --image /tmp/art.jpg \\
        --output /tmp/analysis.json
"""

import sys
import os
import json
import re
import argparse
import shutil
import subprocess
import tempfile

# Sibling analysis scripts live next to this file; make them importable
# regardless of the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

from _key_compat import mashup_compatibility  # noqa: E402  (needs SCRIPT_DIR on sys.path)

_YT_NOISE = [
    # Common YouTube filename noise that pollutes song identification.
    r"\(official\s+(?:music\s+)?video\)",
    r"\(official\s+audio\)",
    r"\(official\s+lyric\s+video\)",
    r"\(official\s+lyrics\)",
    r"\(lyric\s+video\)",
    r"\(lyrics?\)",
    r"\(music\s+video\)",
    r"\(hd\s+(?:hq\s+)?video\)",
    r"\[official\s+(?:music\s+)?video\]",
    r"\[official\s+lyric\s+video\]",
    r"\[lyric\s+video\]",
    r"\[lyrics?\]",
    r"\[hq\]",
    r"\[hd\]",
    r"\[4k\]",
    r"\[1080p\]",
    r"\(remastered(?:\s+\d{4})?\)",
    r"\(remaster\)",
    r"\(live(?:\s+at\s+[^)]+)?\)",
    r"\(audio\)",
    r"\(mono\)",
    r"\(stereo\)",
]
_YT_NOISE_RE = re.compile("|".join(_YT_NOISE), re.IGNORECASE)


def _strip_youtube_noise(text: str) -> str:
    """Remove common YouTube parenthetical/bracket noise like '(Official Video)'.

    Falsy input (``""`` or ``None``) is returned unchanged. After removing
    the noise patterns, whitespace runs are collapsed and leading/trailing
    spaces, tabs, dashes, underscores and dots are stripped.
    """
    if not text:
        return text
    cleaned = _YT_NOISE_RE.sub("", text)
    # Collapse leftover whitespace and dangling punctuation
    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = re.sub(r"\s+([\-\u2013\u2014])\s+", r" \1 ", cleaned)
    return cleaned.strip(" \t-_\u2013\u2014.")


def _parse_song_stem(stem: str) -> tuple:
    """Parse 'Artist - Title [noise]' into (artist, title) with noise stripped.

    Returns ("", "") if the stem is empty, does not contain a ' - '
    separator, or if either side is empty after cleaning.

    The separator is the FIRST ' - ' (with surrounding spaces), so a
    hyphenated artist such as 'Jay-Z' is not split, and everything after
    the first separator is the title even if it contains further ' - '
    (e.g. "Song - Acoustic - Live"). Only the title side is cleaned of
    common YouTube suffix noise.
    """
    if not stem or " - " not in stem:
        return ("", "")
    artist, title = stem.split(" - ", 1)
    artist = artist.strip()
    title = _strip_youtube_noise(title).strip()
    if not artist or not title:
        return ("", "")
    return (artist, title)


AUDIO_EXTENSIONS = {'.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac', '.wma', '.opus'}


def _normalize_output_base(path):
    """Return *path* without its extension if that extension is a known audio one.

    Used to build a yt-dlp output template: 'x.wav' becomes 'x', while a
    path with an unknown (or no) extension is returned unchanged.
    """
    root, ext = os.path.splitext(path)
    return root if ext.lower() in AUDIO_EXTENSIONS else path


def _lazy_import_analyze_audio():
    """Import and return ``analyze_audio.analyze`` on first use."""
    from analyze_audio import analyze as _analyze
    return _analyze


def _lazy_import_analyze_song():
    """Import and return ``analyze_two_songs.analyze_song`` on first use."""
    from analyze_two_songs import analyze_song as _analyze_song
    return _analyze_song


def _run_vocal_emotion(audio_path, use_demucs=False):
    """Run vocal emotion analysis via direct function call (no stdout capture).

    If use_demucs is True, runs Demucs source separation first to extract a
    clean vocal stem, then analyzes the stem. This dramatically improves
    pitch tracking, HNR, and silence detection on busy mixes. If separation
    fails or yields no usable vocals file, the original mix is analyzed.

    Returns the analysis dict, or {'error': '...'} on failure. When the
    Demucs stem was used, the dict gains a '_demucs' entry describing it.
    """
    target_path = audio_path
    demucs_info = None
    if use_demucs:
        try:
            from extract_stems import separate_stems
            print("Running Demucs source separation (--use-demucs)...", file=sys.stderr)
            demucs_result = separate_stems(
                audio_path=audio_path,
                model_name='htdemucs',
                target_stems=['vocals'],
            )
            if 'error' not in demucs_result and 'stems' in demucs_result:
                vocals_path = demucs_result['stems'].get('vocals')
                if vocals_path and os.path.exists(vocals_path):
                    target_path = vocals_path
                    demucs_info = demucs_result
        except Exception as e:
            # Best-effort: separation is an optional quality boost.
            print(f"Demucs failed, falling back to mix: {e}", file=sys.stderr)

    try:
        from analyze_vocal_emotion import analyze_audio
        # analyze_audio takes an argparse-style namespace; build one directly.
        args = argparse.Namespace(
            audio=target_path,
            sections=None,
            output=None,
            hop_length=512,
        )
        result = analyze_audio(args)
        if demucs_info and isinstance(result, dict):
            result['_demucs'] = {
                'model': demucs_info.get('model'),
                'vocals_stem': demucs_info.get('stems', {}).get('vocals'),
                'cache': demucs_info.get('cache'),
            }
        return result
    except SystemExit:
        # The analysis script may call sys.exit(); do not let that kill us.
        return {'error': 'vocal emotion analysis exited via SystemExit'}
    except Exception as e:
        return {'error': f'vocal emotion analysis failed: {e}'}


def _run_lyrics_extraction(audio_path, model='base'):
    """Run Whisper lyrics extraction.

    Returns whatever ``extract_lyrics_whisper.transcribe`` returns, or
    {'error': '...'} if the import or the transcription raises.
    """
    try:
        from extract_lyrics_whisper import transcribe
        return transcribe(audio_path, model_name=model)
    except Exception as e:
        return {'error': f'lyrics extraction failed: {e}'}


def _run_web_lyrics_lookup(artist, title, whisper_text='', min_match=0.6):
    """Optionally look up song lyrics from LRCLib. Returns dict, never raises.

    Returns a dict in the same shape as fetch_lyrics_web.fetch_lyrics_web:
        {
            "status": "matched" | "no_web_lyrics" | "low_match" | "unverified" | "network_error",
            "lyrics": str | None,
            ...
        }

    Any exception (including a missing fetch_lyrics_web module) is reported
    as a "network_error" result with the message in "error".
    """
    try:
        from fetch_lyrics_web import fetch_lyrics_web
        return fetch_lyrics_web(
            artist=artist,
            title=title,
            whisper_transcript=whisper_text,
            min_match=min_match,
        )
    except Exception as e:
        return {
            "status": "network_error",
            "lyrics": None,
            "source": "lrclib",
            "match_score": None,
            "artist": artist,
            "title": title,
            "album": None,
            "duration": None,
            "error": f"web lyrics lookup crashed: {e}",
        }


def _run_beat_tracking(audio_path):
    """Run beat_this beat + downbeat tracking. Returns dict ({'error': ...} on failure)."""
    try:
        from track_beats import track_beats
        return track_beats(audio_path, device='cpu')
    except Exception as e:
        return {'error': f'beat tracking failed: {e}'}


def _run_melody_extraction(audio_path, max_seconds=300):
    """Run Basic Pitch polyphonic AMT. Returns dict ({'error': ...} on failure).

    NOTE(review): max_seconds is accepted but not forwarded to
    extract_melody — confirm whether extract_melody supports a duration
    cap before wiring it through.
    """
    try:
        from extract_melody import extract_melody
        return extract_melody(audio_path)
    except Exception as e:
        return {'error': f'melody extraction failed: {e}'}


def _run_mert_embedding(audio_path, max_seconds=120):
    """Run MERT music embedding. Returns dict ({'error': ...} on failure)."""
    try:
        from compute_audio_embedding import compute_mert_embedding
        return compute_mert_embedding(audio_path, device='cpu', max_seconds=max_seconds)
    except Exception as e:
        return {'error': f'MERT embedding failed: {e}'}


def _run_ast_classification(audio_path, top_k=15):
    """Run AST instrument classification. Returns dict ({'error': ...} on failure)."""
    try:
        from classify_instruments import classify_instruments
        return classify_instruments(audio_path, top_k=top_k, device='cpu')
    except Exception as e:
        return {'error': f'AST classification failed: {e}'}


def _resolve_lyrics(audio_path, lyrics_model, lyrics_source, artist, title, min_match):
    """Produce the 'lyrics' entry for one audio file according to lyrics_source.

    Returns the Whisper result (source 'whisper'), a dict combining web and
    Whisper results (sources 'web' / 'auto'), or None when nothing was run.
    """
    if lyrics_source == 'whisper':
        return _run_lyrics_extraction(audio_path, model=lyrics_model)

    # A web lookup needs both artist and title to identify the song.
    can_lookup = bool(artist and title)

    if lyrics_source == 'web':
        if not can_lookup:
            return None
        web_result = _run_web_lyrics_lookup(
            artist=artist,
            title=title,
            whisper_text='',
            min_match=min_match,
        )
        if not web_result:
            return None
        return {
            'web_lookup': web_result,
            'whisper': None,  # Whisper is never run in web-only mode
        }

    if lyrics_source == 'auto':
        if not can_lookup:
            # No web lookup is possible, so a 'tiny' scoring pass would be
            # wasted work: go straight to the user's requested model.
            return {
                'source': 'whisper',
                'whisper': _run_lyrics_extraction(audio_path, model=lyrics_model),
                'web_lookup': None,
            }

        # Only the match score against the web lookup is needed here. The
        # 'tiny' model is ~3x faster than 'base' and produces enough
        # transcript for word-overlap scoring.
        whisper_result = _run_lyrics_extraction(audio_path, model='tiny')
        whisper_text = ''
        if isinstance(whisper_result, dict):
            whisper_text = whisper_result.get('raw_transcript', '') or ''
        web_result = _run_web_lyrics_lookup(
            artist=artist,
            title=title,
            whisper_text=whisper_text,
            min_match=min_match,
        )

        if web_result and web_result.get('status') == 'matched':
            # Web won. Skipping a Whisper re-run keeps auto fast.
            return {
                'source': 'web',
                'web_lookup': web_result,
                'whisper': whisper_result,  # tiny run, kept for inspection
            }

        # Web missed. Re-run Whisper with the user's model so the user
        # gets the quality they asked for (unless that model was 'tiny').
        if lyrics_model != 'tiny':
            whisper_result = _run_lyrics_extraction(audio_path, model=lyrics_model)
        return {
            'source': 'whisper',
            'whisper': whisper_result,
            'web_lookup': web_result,  # kept so the caller can see what was tried
        }

    # 'off' (or any unrecognized strategy): skip lyrics entirely.
    return None


def analyze_audio_file(audio_path, run_emotion=True, run_lyrics=False, lyrics_model='base',
                       use_demucs=False, run_advanced=True, lyrics_source='whisper',
                       lyrics_artist='', lyrics_title='', lyrics_web_min_match=0.6):
    """Run all audio analyses and return a combined result.

    If run_advanced is True (default), also runs beat_this, Basic Pitch,
    MERT, and AST classification. Each is best-effort — failure is captured
    in the result dict, not raised.

    lyrics_source controls the lyrics extraction strategy (only consulted
    when run_lyrics is True):
      - "whisper" (default): use Whisper only
      - "web": use LRCLib only (requires lyrics_artist and lyrics_title)
      - "auto": score a fast Whisper pass against LRCLib and prefer the
        web lyrics when they match; otherwise use Whisper with lyrics_model
      - "off": skip lyrics entirely

    Returns a dict with keys 'audio_file', 'basic_features',
    'vocal_emotion', 'lyrics', 'beats', 'melody', 'mert_embedding' and
    'ast_classification'; analyses that were not requested stay None.
    """
    result = {
        'audio_file': audio_path,
        'basic_features': None,
        'vocal_emotion': None,
        'lyrics': None,
        'beats': None,
        'melody': None,
        'mert_embedding': None,
        'ast_classification': None,
    }

    try:
        run_analyze_audio = _lazy_import_analyze_audio()
        result['basic_features'] = run_analyze_audio(audio_path)
    except Exception as e:
        result['basic_features'] = {'error': str(e)}

    if run_emotion:
        result['vocal_emotion'] = _run_vocal_emotion(audio_path, use_demucs=use_demucs)

    if run_lyrics:
        result['lyrics'] = _resolve_lyrics(
            audio_path,
            lyrics_model=lyrics_model,
            lyrics_source=lyrics_source,
            artist=lyrics_artist,
            title=lyrics_title,
            min_match=lyrics_web_min_match,
        )

    if run_advanced:
        result['beats'] = _run_beat_tracking(audio_path)
        result['melody'] = _run_melody_extraction(audio_path)
        result['mert_embedding'] = _run_mert_embedding(audio_path)
        result['ast_classification'] = _run_ast_classification(audio_path)

    return result


def download_youtube(url, output_path, prefer_standalone=True):
    """Download audio from a YouTube URL.

    prefer_standalone=True uses the more featureful scripts/download_youtube.py
    (auto-installs yt-dlp, robust error handling, metadata extraction). If
    that module cannot be imported, falls back to invoking the yt-dlp CLI.

    Returns the path to the downloaded audio file (a WAV in the normal
    case), or None on failure.
    """
    if prefer_standalone:
        try:
            from download_youtube import download_youtube as _standalone
            return _standalone(url, output_path)
        except (ImportError, AttributeError):
            pass

    output_base = _normalize_output_base(output_path)
    try:
        completed = subprocess.run(
            ['yt-dlp', '-x', '--audio-format', 'wav',
             '-o', f'{output_base}.%(ext)s', url],
            capture_output=True, text=True, timeout=300,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired) as e:
        # yt-dlp is not installed, or the download hung.
        print(f"yt-dlp fallback failed: {e}", file=sys.stderr)
        return None

    if completed.returncode != 0:
        detail = (completed.stderr or '').strip()
        print(f"yt-dlp exited with code {completed.returncode}: {detail[-500:]}",
              file=sys.stderr)
        return None

    # We asked for WAV, so probe that first; the remaining extensions are
    # checked in a fixed order so the result is deterministic.
    extensions = ['.wav'] + sorted(AUDIO_EXTENSIONS - {'.wav'})
    for ext in extensions:
        candidate = f'{output_base}{ext}'
        if os.path.exists(candidate):
            return candidate
    if os.path.exists(output_path):
        return output_path
    return None


def orchestrate(audios=(), video=None, image=None, youtubes=(), name_a=None, name_b=None,
                run_lyrics=False, lyrics_model='base', use_demucs=False, run_advanced=True,
                image_ocr=False, image_faces=False, image_vlm=False,
                lyrics_source='whisper', lyrics_web_min_match=0.6):
    """Run all the requested analyses and return a unified result.

    audios / youtubes are combined (local files first, then successful
    downloads): two or more sources trigger mashup analysis of the first
    two, exactly one triggers the full single-audio analysis.

    run_advanced controls beat_this + Basic Pitch + MERT + AST. Default True.
    image_ocr/image_faces/image_vlm control optional image-side analyses.

    Returns a dict with an 'inputs' entry plus 'mashup' or 'audio', and
    'video' / 'image' when those inputs were given. Downloaded YouTube
    audio lives in private temp directories that are removed before
    returning, so paths to it in the result are informational only.
    """
    result = {
        'inputs': {
            'audio_files': list(audios),
            'video_files': [video] if video else [],
            'image_files': [image] if image else [],
            'youtube_urls': list(youtubes),
            'name_a': name_a,
            'name_b': name_b,
        },
    }

    youtube_paths = []
    youtube_tmpdirs = []
    try:
        for url in youtubes:
            # A private directory per download avoids reusing a released
            # temp-file name and lets cleanup remove the audio whatever
            # extension (or sidecar files) the downloader produced.
            tmp_dir = tempfile.mkdtemp(prefix='analysis_yt_')
            youtube_tmpdirs.append(tmp_dir)
            tmp_path = os.path.join(tmp_dir, 'download.wav')
            print(f"Downloading YouTube: {url}", file=sys.stderr)
            downloaded = download_youtube(url, tmp_path)
            if downloaded and os.path.exists(downloaded):
                youtube_paths.append(downloaded)
                result['inputs']['audio_files'].append(downloaded)
            else:
                print(f"Failed to download: {url}", file=sys.stderr)

        all_audio_paths = list(audios) + youtube_paths

        if len(all_audio_paths) >= 2:
            run_analyze_song = _lazy_import_analyze_song()
            song_a_data = run_analyze_song(all_audio_paths[0], name_a or 'Song A')
            song_b_data = run_analyze_song(all_audio_paths[1], name_b or 'Song B')
            compat = mashup_compatibility(song_a_data, song_b_data)
            mashup_result = {
                'song_a': song_a_data,
                'song_b': song_b_data,
                'compatibility': compat,
            }
            # Honor lyrics_source='off' here as the single-audio path does.
            if run_lyrics and lyrics_source != 'off':
                mashup_result['song_a_lyrics'] = _run_lyrics_extraction(
                    all_audio_paths[0], model=lyrics_model
                )
            song_a_emotion = _run_vocal_emotion(all_audio_paths[0], use_demucs=use_demucs)
            if song_a_emotion and not song_a_emotion.get('error'):
                mashup_result['song_a_vocal_emotion'] = song_a_emotion
            result['mashup'] = mashup_result
        elif len(all_audio_paths) == 1:
            # For the single-audio case, optionally try to identify the song
            # for web lyrics lookup. The audio file's stem is tried first;
            # name_a is the fallback when the stem does not parse.
            stem = os.path.splitext(os.path.basename(all_audio_paths[0]))[0]
            if not stem or stem.startswith(("audio", "input")):
                # Generic names carry no song identity.
                stem = ""
            song_artist, song_title = _parse_song_stem(stem) if stem else ("", "")
            if (not song_artist or not song_title) and name_a:
                song_artist, song_title = _parse_song_stem(name_a)
            result['audio'] = analyze_audio_file(
                all_audio_paths[0],
                run_emotion=True,
                run_lyrics=run_lyrics,
                lyrics_model=lyrics_model,
                use_demucs=use_demucs,
                run_advanced=run_advanced,
                lyrics_source=lyrics_source,
                lyrics_artist=song_artist,
                lyrics_title=song_title,
                lyrics_web_min_match=lyrics_web_min_match,
            )
        else:
            result['audio'] = None  # type: ignore[assignment]

        if video:
            try:
                from extract_video_features import analyze_video
                video_result = analyze_video(video)
                video_audio_path = None
                try:
                    # Extract a mono 22.05 kHz WAV for vocal emotion analysis.
                    tmp_fd, video_audio_path = tempfile.mkstemp(suffix='.wav')
                    os.close(tmp_fd)
                    cmd = ['ffmpeg', '-y', '-i', video, '-vn', '-ar', '22050',
                           '-ac', '1', '-f', 'wav', video_audio_path]
                    ffmpeg_result = subprocess.run(cmd, capture_output=True, text=True,
                                                   timeout=120)
                    if ffmpeg_result.returncode == 0 and os.path.exists(video_audio_path):
                        video_emotion = _run_vocal_emotion(video_audio_path,
                                                           use_demucs=use_demucs)
                        if video_emotion and not video_emotion.get('error'):
                            video_result['vocal_emotion'] = video_emotion
                except (FileNotFoundError, subprocess.TimeoutExpired):
                    # ffmpeg missing or too slow: keep the visual analysis only.
                    pass
                finally:
                    if video_audio_path and os.path.exists(video_audio_path):
                        try:
                            os.unlink(video_audio_path)
                        except OSError:
                            pass
                result['video'] = video_result
            except Exception as e:
                result['video'] = {'error': f'video analysis failed: {e}'}

        if image:
            try:
                from analyze_image import analyze_image
                result['image'] = analyze_image(
                    image,
                    use_ocr=image_ocr,
                    use_faces=image_faces,
                    vlm_caption=image_vlm,
                )
            except Exception as e:
                result['image'] = {'error': f'image analysis failed: {e}'}
    finally:
        for tmp_dir in youtube_tmpdirs:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    return result


class _NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to plain Python values."""

    def default(self, o):
        try:
            import numpy as np
        except ImportError:
            # Without NumPy there is nothing NumPy-specific to convert.
            return super().default(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        return super().default(o)


def main():
    """Parse CLI arguments, run the orchestration, and emit the result as JSON.

    Writes to --output when given (reporting the path on stderr),
    otherwise prints the JSON to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Analyze any music input(s) and produce a unified analysis')
    parser.add_argument('--audio', action='append', default=[],
                        help='Audio file path (can be specified multiple times for mashup)')
    parser.add_argument('--video', help='Video file path')
    parser.add_argument('--image', help='Image file path (album art, etc.)')
    parser.add_argument('--youtube', action='append', default=[],
                        help='YouTube URL (will be downloaded)')
    parser.add_argument('--name-a', help='Name/label for the first song (content source)')
    parser.add_argument('--name-b', help='Name/label for the second song (style reference)')
    parser.add_argument('--lyrics', action='store_true',
                        help='Run Whisper lyrics extraction on audio inputs (slower)')
    parser.add_argument('--lyrics-model', default='base',
                        choices=['tiny', 'base', 'small', 'medium', 'large'],
                        help='Whisper model size for lyrics extraction (default: base)')
    parser.add_argument('--lyrics-source', default='whisper',
                        choices=['whisper', 'web', 'auto', 'off'],
                        help='Lyrics source strategy. "whisper" = always use Whisper. '
                             '"web" = always try LRCLib. "auto" = Whisper first, then '
                             'optionally fall back to LRCLib if the song is recognized. '
                             '"off" = skip lyrics entirely. (default: whisper)')
    parser.add_argument('--lyrics-web-min-match', type=float, default=0.6,
                        help='Minimum word-overlap score to accept web-fetched lyrics '
                             '(0.0-1.0, default 0.6). Lower = more permissive.')
    parser.add_argument('--use-demucs', action='store_true',
                        help='Run Demucs source separation first, then analyze the vocal stem. '
                             'Dramatically improves pitch/HNR/silence detection on busy mixes. '
                             'CPU: ~30s for 4min song, GPU: ~5s. Requires: pip install demucs')
    parser.add_argument('--no-advanced', action='store_true',
                        help='Skip beat_this, Basic Pitch, MERT, and AST analyses (faster, '
                             'less rich). Default: run all advanced analyses.')
    parser.add_argument('--vlm', action='store_true',
                        help='For image inputs, also call "mmx vision describe" (MiniMax 3.0) '
                             'for a free-form caption. Requires the mmx CLI.')
    parser.add_argument('--ocr', action='store_true',
                        help='For image inputs, also run RapidOCR text extraction.')
    parser.add_argument('--faces', action='store_true',
                        help='For image inputs, also run MediaPipe face detection.')
    parser.add_argument('--output', '-o', help='Output JSON file path')
    args = parser.parse_args()

    if not (args.audio or args.video or args.image or args.youtube):
        parser.error('At least one input is required: --audio, --video, --image, or --youtube')

    audio_source_count = len(args.audio) + len(args.youtube)
    if audio_source_count > 2:
        parser.error(
            f'At most two audio sources are supported for mashup analysis; '
            f'got {audio_source_count}. Split the request or treat extra sources as text-only references.'
        )

    result = orchestrate(
        audios=tuple(args.audio),
        video=args.video,
        image=args.image,
        youtubes=tuple(args.youtube),
        name_a=args.name_a,
        name_b=args.name_b,
        run_lyrics=args.lyrics,
        lyrics_model=args.lyrics_model,
        use_demucs=args.use_demucs,
        run_advanced=not args.no_advanced,
        image_ocr=args.ocr,
        image_faces=args.faces,
        image_vlm=args.vlm,
        lyrics_source=args.lyrics_source,
        lyrics_web_min_match=args.lyrics_web_min_match,
    )

    json_out = json.dumps(result, indent=2, ensure_ascii=False, cls=_NumpyEncoder)

    if args.output:
        # ensure_ascii=False emits raw non-ASCII text (lyrics, titles), so
        # the file must be written as UTF-8 regardless of the locale default.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(json_out)
        print(f"Written: {args.output}", file=sys.stderr)
    else:
        print(json_out)


if __name__ == '__main__':
    main()