#!/usr/bin/env python3
"""analyze_audio.py — Extract musical features for song parody generation.

Usage: python3 analyze_audio.py <audio_file>
Output: JSON with BPM, key, duration, energy profile, section estimates.
"""
import sys
import json

import librosa
import numpy as np

# Optional: transformers for CLAP zero-shot audio classification
try:
    from transformers import pipeline
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

# Shared audio feature utilities (key detection, tempo/energy classification,
# section estimation). See _audio_features.py for the canonical implementations.
from _audio_features import (
    detect_key,
    estimate_sections,
    classify_tempo,
    classify_energy,
)

# Default hop length for librosa energy/feature analysis. Used to convert frame
# indices back to time (frame * HOP_LENGTH / sr = seconds). Must match the
# hop_length used by librosa.feature.* in this file.
HOP_LENGTH = 512

# Sample rate every input is resampled to on load.
SAMPLE_RATE = 22050

# Number of points the RMS curve is resampled to for the energy profile.
ENERGY_PROFILE_POINTS = 20

# CLAP checkpoint used for zero-shot audio classification.
CLAP_MODEL = "laion/clap-htsat-unfused"

# Zero-shot label vocabulary, defined once per category. The flat candidate
# list handed to the classifier is derived from these, so the per-category
# filters can never drift out of sync with what was actually scored.
GENRE_LABELS = [
    "pop", "rock", "jazz", "blues", "classical", "electronic",
    "hip hop", "rnb", "country", "folk", "metal", "punk",
    "reggae", "latin", "soul", "funk", "disco", "house",
    "techno", "ambient", "indie", "alternative", "dream pop",
    "synthwave", "post-rock", "shoegaze", "lo-fi",
]
MOOD_LABELS = [
    "happy", "sad", "melancholic", "euphoric", "aggressive",
    "tender", "dark", "bright", "nostalgic", "epic",
    "peaceful", "tense",
]
INSTRUMENT_LABELS = [
    "piano", "guitar", "drums", "bass", "synthesizer",
    "strings", "brass", "vocals", "choir", "organ",
    "saxophone", "violin",
]
VOCAL_LABELS = ["female vocals", "male vocals", "duet", "choir vocals"]
ERA_LABELS = ["1960s", "1970s", "1980s", "1990s", "2000s", "2010s", "2020s"]

# Same order as the original hand-written list: genres, moods, instruments,
# vocal style, era.
CANDIDATE_LABELS = (
    GENRE_LABELS + MOOD_LABELS + INSTRUMENT_LABELS + VOCAL_LABELS + ERA_LABELS
)


def _top_n(results, labels, n=3):
    """Return the ``n`` best ``(label, score)`` pairs among ``labels``.

    ``results`` is an iterable of mappings with ``'label'`` and ``'score'``
    keys. Scores are rounded to 3 decimals before sorting (descending).
    """
    filtered = [
        (r["label"], round(float(r["score"]), 3))
        for r in results
        if r["label"] in labels
    ]
    filtered.sort(key=lambda pair: pair[1], reverse=True)
    return filtered[:n]


def _classify_with_clap(audio_path):
    """Run CLAP zero-shot classification and group the top labels by category.

    Always returns a dict with a boolean ``"detected"`` key:

    * success: ``detected`` is True plus ``top_genres``, ``top_moods``,
      ``top_instruments``, ``top_vocal_style`` and ``top_era``;
    * classifier failure: ``detected`` is False plus ``error``;
    * transformers missing: ``detected`` is False plus ``note``.
    """
    if not HAS_TRANSFORMERS:
        return {
            "detected": False,
            "note": "transformers not installed — install with: pip install transformers torch",
        }

    try:
        classifier = pipeline("zero-shot-audio-classification", model=CLAP_MODEL)
        results = classifier(audio_path, candidate_labels=CANDIDATE_LABELS)
    except Exception as e:
        # Deliberate best-effort: classification is an optional extra, so a
        # model download/decoding failure must not abort the core analysis.
        return {"detected": False, "error": str(e)}

    return {
        "detected": True,
        "top_genres": _top_n(results, GENRE_LABELS),
        "top_moods": _top_n(results, MOOD_LABELS),
        "top_instruments": _top_n(results, INSTRUMENT_LABELS),
        "top_vocal_style": _top_n(results, VOCAL_LABELS, n=2),
        "top_era": _top_n(results, ERA_LABELS, n=2),
    }


def analyze(audio_path):
    """Analyze an audio file and return its musical features as a dict.

    Args:
        audio_path: Path to the audio file; passed to ``librosa.load`` and,
            when transformers is available, to the CLAP classifier.

    Returns:
        A JSON-serialisable dict with the keys ``bpm``, ``tempo_feel``,
        ``tempo_consistency``, ``estimated_key``, ``key_confidence``,
        ``duration_seconds``, ``duration_formatted``, ``beat_count``,
        ``onset_density``, ``brightness``, ``energy_description``,
        ``energy_profile``, ``approximate_sections`` and
        ``clap_classification``.
    """
    print(f"Loading: {audio_path}", file=sys.stderr)
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)

    # 1. Tempo / BPM
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # beat_track may return tempo as a scalar or a 1-element array.
    bpm = float(np.atleast_1d(tempo)[0])
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    if len(beat_times) > 1:
        beat_intervals = np.diff(beat_times)
    else:
        # Too few beats to measure spacing; a single dummy interval yields
        # zero deviation, i.e. a consistency of 1.0.
        beat_intervals = np.array([0.5])
    # 1 - coefficient of variation of the beat spacing, clamped to [0, 1].
    variation = np.std(beat_intervals) / (np.mean(beat_intervals) + 1e-8)
    tempo_consistency = float(1.0 - min(variation, 1.0))

    # 2. Key detection
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    chroma_avg = np.mean(chroma, axis=1)
    key, key_confidence = detect_key(chroma_avg)

    # 3. Duration
    duration = len(y) / sr

    # 4. RMS energy profile (ENERGY_PROFILE_POINTS samples across duration).
    # hop_length is passed explicitly so the frame->time conversion done by
    # estimate_sections below is guaranteed to use the same hop.
    rms = librosa.feature.rms(y=y, hop_length=HOP_LENGTH)[0]
    rms_resampled = np.interp(
        np.linspace(0, len(rms) - 1, ENERGY_PROFILE_POINTS),
        np.arange(len(rms)),
        rms,
    )
    energy_profile = [round(float(e), 5) for e in rms_resampled]
    energy_description = classify_energy(rms_resampled)

    # 5. Section estimation
    sections = estimate_sections(rms, sr, duration, hop_length=HOP_LENGTH)

    # 6. Onset density (notes per second — proxy for complexity)
    onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
    # Guard against a zero-length signal instead of raising ZeroDivisionError.
    onset_density = len(onset_frames) / duration if duration > 0 else 0.0

    # 7. Spectral features (brightness), bucketed on the mean spectral centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    avg_brightness = float(np.mean(spectral_centroid))
    if avg_brightness > 3000:
        brightness = "bright/treble-heavy"
    elif avg_brightness > 1500:
        brightness = "balanced"
    else:
        brightness = "warm/bass-heavy"

    # 8. Tempo feel
    tempo_feel = classify_tempo(bpm)

    # 9. CLAP zero-shot classification
    #    (genre / mood / instruments / vocal style / era)
    clap_data = _classify_with_clap(audio_path)

    return {
        "bpm": round(bpm, 1),
        "tempo_feel": tempo_feel,
        "tempo_consistency": round(tempo_consistency, 2),
        "estimated_key": key,
        "key_confidence": key_confidence,
        "duration_seconds": round(duration, 1),
        "duration_formatted": f"{int(duration // 60)}:{int(duration % 60):02d}",
        "beat_count": len(beat_frames),
        "onset_density": round(onset_density, 2),
        "brightness": brightness,
        "energy_description": energy_description,
        "energy_profile": energy_profile,
        "approximate_sections": sections,
        "clap_classification": clap_data,
    }


def main():
    """CLI entry point: analyze the file named in argv and print JSON to stdout."""
    if len(sys.argv) != 2:
        print("Usage: analyze_audio.py <audio_file>", file=sys.stderr)
        sys.exit(1)

    result = analyze(sys.argv[1])
    print(json.dumps(result, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()