#!/usr/bin/env python3 """ Single-pass video renderer. Selects segments, applies subtitles and cover in ONE encoding pass — no intermediate re-encodes. Usage: python3 render_final.py --config render_config.json --output final.mp4 The config JSON format: { "clips": [ {"video": "path/to/video1.MOV", "segment_id": 4, "transcript": "path/to/transcript1.json"}, {"video": "path/to/video1.MOV", "segment_id": 5, "transcript": "path/to/transcript1.json"}, {"video": "path/to/video2.MOV", "segment_id": 1, "transcript": "path/to/transcript2.json"} ], "title": "封面标题", "chapters": [ {"title": "痛点", "start": 0.0, "end": 27.5}, ... ] } """ import argparse import json import os import subprocess import sys import tempfile sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from utils import ( find_chinese_font, get_video_info, get_ffmpeg_encode_args, escape_ffmpeg_path, sanitize_title, detect_gpu, ) from burn_subtitles import ( detect_language, escape_ass_text, wrap_subtitle_text, ) from generate_cover_image import generate_cover as generate_cover_image # --- Caption style presets --- CAPTION_PRESETS = { "normal": { "primary": "&H00FFFFFF", "outline": "&H00000000", "outline_w": 3, "shadow": 1, "bold": 1, }, "bold_pop": { "primary": "&H00FFFFFF", "outline": "&H00000000", "outline_w": 6, "shadow": 3, "bold": 1, }, "neon": { "primary": "&H00FFFF00", "outline": "&H00FF00FF", "outline_w": 4, "shadow": 0, "bold": 1, }, "minimal": { "primary": "&H00FFFFFF", "outline": "&H00000000", "outline_w": 0, "shadow": 2, "bold": 0, }, "yellow_pop": { "primary": "&H0000FFFF", "outline": "&H00000000", "outline_w": 4, "shadow": 1, "bold": 1, }, } # --- Multi-platform output formats --- OUTPUT_FORMATS = { "vertical": {"width": 1080, "height": 1920, "label": "9:16 (抖音/小红书/TikTok)"}, "square": {"width": 1080, "height": 1080, "label": "1:1 (Instagram)"}, "horizontal": {"width": 1920, "height": 1080, "label": "16:9 (YouTube/B站)"}, } def build_reformat_filter(src_w, src_h, dst_w, dst_h): """Build ffmpeg filter to reformat video dimensions via center-crop.""" src_ratio = src_w / src_h dst_ratio = dst_w / dst_h if abs(src_ratio - dst_ratio) < 0.01: return f"scale={dst_w}:{dst_h}" elif src_ratio > dst_ratio: return f"scale=-1:{dst_h},crop={dst_w}:{dst_h}" else: return f"scale={dst_w}:-1,crop={dst_w}:{dst_h}" def load_config(config_path): try: with open(config_path, encoding="utf-8") as f: return json.load(f) except json.JSONDecodeError as e: print(f"Error: Invalid JSON in {config_path}: {e}", file=sys.stderr) sys.exit(1) except FileNotFoundError: print(f"Error: Config file not found: {config_path}", file=sys.stderr) sys.exit(1) def resolve_clips(config): """Resolve clip entries to (video_path, start, end, text) tuples.""" transcript_cache = {} clips = [] errors = [] for i, entry in enumerate(config["clips"]): video = os.path.abspath(entry["video"]) transcript = os.path.abspath(entry["transcript"]) seg_id = entry["segment_id"] if not os.path.isfile(video): errors.append(f"Clip #{i+1}: video not found: {video}") continue if not os.path.isfile(transcript): errors.append(f"Clip #{i+1}: transcript not found: {transcript}") continue if transcript not in transcript_cache: with open(transcript, encoding="utf-8") as f: data = json.load(f) transcript_cache[transcript] = {s["id"]: s for s in data["segments"]} if seg_id not in transcript_cache[transcript]: errors.append(f"Clip #{i+1}: segment_id {seg_id} not found in {os.path.basename(transcript)}") continue seg = transcript_cache[transcript][seg_id] resolved = { "video": video, "start": seg["start"], "end": seg["end"], "text": seg["text"], } if "words" in seg: resolved["words"] = seg["words"] if "broll" in entry: resolved["broll"] = os.path.abspath(entry["broll"]) resolved["broll_start"] = entry.get("broll_start", 0.0) clips.append(resolved) if errors: print("Config validation errors:", file=sys.stderr) for e in errors: print(f" {e}", file=sys.stderr) sys.exit(1) return clips def build_merged_ass(clips, font_name, font_size, video_width, video_height, speed=1.0, cover_duration=0.0, end_cards=None, subtitle_style="normal"): """Build a single ASS subtitle file covering the entire merged timeline. Args: cover_duration: Seconds of cover at the start; subtitles begin after this. end_cards: List of {"text": str, "duration": float} for ending cards. subtitle_style: Caption preset name (normal/bold_pop/neon/minimal/yellow_pop). """ margin_lr = 60 usable_width = video_width - 2 * margin_lr margin_v = int(video_height * 0.28) end_card_fs = int(font_size * 1.4) # Apply caption preset preset = CAPTION_PRESETS.get(subtitle_style, CAPTION_PRESETS["normal"]) p_color = preset["primary"] o_color = preset["outline"] o_width = preset["outline_w"] s_depth = preset["shadow"] bold = preset["bold"] def fmt_time(seconds): h = int(seconds // 3600) m = int((seconds % 3600) // 60) s = seconds % 60 return f"{h}:{m:02d}:{s:05.2f}" header = f"""[Script Info] Title: Merged Subtitles ScriptType: v4.00+ PlayResX: {video_width} PlayResY: {video_height} WrapStyle: 0 [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding Style: Default,{font_name},{font_size},{p_color},&H000000FF,{o_color},&H80000000,{bold},0,0,0,100,100,0,0,1,{o_width},{s_depth},2,{margin_lr},{margin_lr},{margin_v},1 Style: EndCard,{font_name},{end_card_fs},&H00FFFFFF,&H000000FF,&H00000000,&H00000000,1,0,0,0,100,100,2,0,0,0,0,5,{margin_lr},{margin_lr},0,1 [Events] Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text """ dialogues = [] offset = cover_duration # Start subtitles after cover for clip in clips: dur = clip["end"] - clip["start"] text = clip["text"] lang = detect_language(text) if lang == "zh": max_chars = int(usable_width / font_size) else: max_chars = int(usable_width / (font_size * 0.55)) wrapped = wrap_subtitle_text(text, max_chars, lang) escaped = escape_ass_text(wrapped) scaled_dur = dur / speed start_t = fmt_time(offset) end_t = fmt_time(offset + scaled_dur) dialogues.append(f"Dialogue: 0,{start_t},{end_t},Default,,0,0,0,,{escaped}") offset += scaled_dur # End cards: centered text on black screen with fade end_cards_duration = 0.0 if end_cards: for card in end_cards: card_text = card["text"] card_dur = card.get("duration", 3.0) fade_in = 300 # ms fade_out = 300 # ms start_t = fmt_time(offset) end_t = fmt_time(offset + card_dur) escaped = escape_ass_text(card_text) escaped = escaped.replace("\n", "\\N") dialogues.append( f"Dialogue: 0,{start_t},{end_t},EndCard,,0,0,0,,{{\\fad({fade_in},{fade_out})}}{escaped}" ) offset += card_dur end_cards_duration += card_dur return header + "\n".join(dialogues) + "\n", offset, end_cards_duration def _ass_color(hex_color): """Convert '#RRGGBB' or '#AARRGGBB' to ASS '&HAABBGGRR' format.""" h = hex_color.lstrip("#") if len(h) == 6: r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16) return f"&H00{b:02X}{g:02X}{r:02X}" elif len(h) == 8: a, r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16), int(h[6:8], 16) return f"&H{a:02X}{b:02X}{g:02X}{r:02X}" return "&H0000FFFF" # fallback yellow def _build_karaoke_line(clip, seg_start): """Build ASS karaoke text from word timestamps. Uses \\kf (smooth fill) for each word. If word timestamps are missing, falls back to even distribution across characters. """ words = clip.get("words") text = clip["text"] seg_duration = clip["end"] - clip["start"] if words: parts = [] prev_end = 0.0 # relative to segment start for w in words: word_rel_end = w["end"] - seg_start # Duration from previous word end to this word's end (in centiseconds) kf_cs = max(1, round((word_rel_end - prev_end) * 100)) escaped = escape_ass_text(w["word"]) parts.append(f"{{\\kf{kf_cs}}}{escaped}") prev_end = word_rel_end return "".join(parts) else: # Fallback: distribute evenly across characters chars = list(text) if not chars: return escape_ass_text(text) per_char_cs = max(1, round(seg_duration * 100 / len(chars))) parts = [f"{{\\kf{per_char_cs}}}{escape_ass_text(c)}" for c in chars] return "".join(parts) def build_karaoke_ass(clips, font_name, font_size, video_width, video_height, speed=1.0, cover_duration=0.0, end_cards=None, highlight_color="#FFFF00", base_color="#FFFFFF", base_alpha="80"): """Build ASS subtitle file with karaoke word-by-word highlighting. Uses ASS \\kf tags: text starts in SecondaryColour (base/dim) and fills to PrimaryColour (highlight) as each word is spoken. Args: highlight_color: Hex color for the active/highlighted word (default yellow). base_color: Hex color for words not yet spoken (default white). base_alpha: Alpha hex for base color (00=opaque, FF=transparent, default 80=semi). """ margin_lr = 60 margin_v = int(video_height * 0.28) end_card_fs = int(font_size * 1.4) # ASS colors: PrimaryColour = after karaoke fill, SecondaryColour = before fill primary = _ass_color(highlight_color) # Base color with alpha for "not yet spoken" dimmed look bh = base_color.lstrip("#") if len(bh) == 6: r, g, b = int(bh[0:2], 16), int(bh[2:4], 16), int(bh[4:6], 16) secondary = f"&H{base_alpha}{b:02X}{g:02X}{r:02X}" else: secondary = f"&H{base_alpha}FFFFFF" def fmt_time(seconds): h = int(seconds // 3600) m = int((seconds % 3600) // 60) s = seconds % 60 return f"{h}:{m:02d}:{s:05.2f}" header = f"""[Script Info] Title: Karaoke Subtitles ScriptType: v4.00+ PlayResX: {video_width} PlayResY: {video_height} WrapStyle: 0 [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding Style: Karaoke,{font_name},{font_size},{primary},{secondary},&H00000000,&H80000000,1,0,0,0,100,100,0,0,1,3,1,2,{margin_lr},{margin_lr},{margin_v},1 Style: EndCard,{font_name},{end_card_fs},&H00FFFFFF,&H000000FF,&H00000000,&H00000000,1,0,0,0,100,100,2,0,0,0,0,5,{margin_lr},{margin_lr},0,1 [Events] Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text """ dialogues = [] offset = cover_duration for clip in clips: dur = clip["end"] - clip["start"] scaled_dur = dur / speed start_t = fmt_time(offset) end_t = fmt_time(offset + scaled_dur) karaoke_text = _build_karaoke_line(clip, clip["start"]) dialogues.append(f"Dialogue: 0,{start_t},{end_t},Karaoke,,0,0,0,,{karaoke_text}") offset += scaled_dur # End cards (same as normal mode) end_cards_duration = 0.0 if end_cards: for card in end_cards: card_text = card["text"] card_dur = card.get("duration", 3.0) start_t = fmt_time(offset) end_t = fmt_time(offset + card_dur) escaped = escape_ass_text(card_text).replace("\n", "\\N") dialogues.append( f"Dialogue: 0,{start_t},{end_t},EndCard,,0,0,0,,{{\\fad(300,300)}}{escaped}" ) offset += card_dur end_cards_duration += card_dur return header + "\n".join(dialogues) + "\n", offset, end_cards_duration def _clips_in_temporal_order(clips): """Check if all clips come from one video and are in temporal order.""" videos = set(c["video"] for c in clips) if len(videos) != 1: return False for i in range(1, len(clips)): if clips[i]["start"] < clips[i - 1]["start"]: return False return True def build_select_filter(clips, fps): """Build filter using select/aselect with between() expressions. Much simpler than trim/concat: one expression selects all segments, FFmpeg decodes the full source but only encodes selected frames. Only works for single-video, temporally-ordered clips. Returns (filter_str, input_files). """ between_exprs = [ f"between(t,{c['start']:.4f},{c['end']:.4f})" for c in clips ] select_expr = "+".join(between_exprs) filters = [ f"[0:v]select='{select_expr}',setpts=N/{fps:.4f}/TB[merged_v]", f"[0:a]aselect='{select_expr}',asetpts=N/SR/TB[merged_a]", ] return ";\n".join(filters), [clips[0]["video"]] def build_trim_filter(clips, target_w=None, target_h=None): """Build filter_complex string for trimming and concatenating clips. Fallback for multi-video or reordered clips where select filter cannot be used. Supports B-roll: clips with a "broll" key use video from the broll source but audio from the original source. B-roll is scaled/cropped to match target_w x target_h. Returns (filter_str, input_files). """ # Deduplicate input files while preserving order input_files = [] input_index = {} for clip in clips: for vpath in [clip["video"], clip.get("broll")]: if vpath and vpath not in input_index: input_index[vpath] = len(input_files) input_files.append(vpath) filters = [] n = len(clips) concat_inputs = "" for i, clip in enumerate(clips): audio_idx = input_index[clip["video"]] broll = clip.get("broll") video_idx = input_index[broll] if broll else audio_idx s = clip["start"] e = clip["end"] dur = e - s if broll: broll_start = clip.get("broll_start", 0.0) broll_end = broll_start + dur filters.append( f"[{video_idx}:v]trim=start={broll_start:.4f}:end={broll_end:.4f},setpts=PTS-STARTPTS,scale={target_w}:{target_h}:force_original_aspect_ratio=increase,crop={target_w}:{target_h}[v{i}]" ) else: filters.append( f"[{video_idx}:v]trim=start={s:.4f}:end={e:.4f},setpts=PTS-STARTPTS[v{i}]" ) filters.append( f"[{audio_idx}:a]atrim=start={s:.4f}:end={e:.4f},asetpts=PTS-STARTPTS[a{i}]" ) concat_inputs += f"[v{i}][a{i}]" filters.append(f"{concat_inputs}concat=n={n}:v=1:a=1[merged_v][merged_a]") return ";\n".join(filters), input_files def generate_cover_png(video_path, title, width, height, temp_files, style="bold", subtitle=None, use_frame=False): """Generate cover PNG using headless Chrome. Returns path to the cover PNG, or None if generation fails. """ if not title: return None fd, cover_path = tempfile.mkstemp(suffix=".png", prefix="cover_") os.close(fd) temp_files.append(cover_path) result = generate_cover_image( video_path, title, output_path=cover_path, width=width, height=height, style=style, subtitle=subtitle, use_frame=use_frame, ) return result def main(): parser = argparse.ArgumentParser(description="Single-pass video renderer") parser.add_argument("--config", required=True, help="Path to render config JSON") parser.add_argument("--output", required=True, help="Output video path") parser.add_argument("--font-path", default=None, help="Custom font path") parser.add_argument("--font-size", type=int, default=48, help="Subtitle font size") parser.add_argument("--no-subtitles", action="store_true") parser.add_argument("--no-cover", action="store_true") parser.add_argument("--speed", nargs="*", type=float, default=[], help="Additional speed variants to render (e.g. --speed 1.25 1.5)") parser.add_argument("--cover-duration", type=float, default=None, help="Cover freeze duration in seconds (default: from config or 2.0)") parser.add_argument("--cleanup", action="store_true", help="Remove temp files after render") parser.add_argument("--bgm", default=None, help="Background music file path (overrides config)") parser.add_argument("--bgm-volume", type=float, default=None, help="BGM volume 0.0-1.0 (default: from config or 0.15)") parser.add_argument("--subtitle-style", default=None, choices=["normal", "karaoke", "bold_pop", "neon", "minimal", "yellow_pop"], help="Subtitle style (default: from config or 'normal')") parser.add_argument("--formats", nargs="*", choices=list(OUTPUT_FORMATS.keys()), help="Additional output formats: vertical, square, horizontal") args = parser.parse_args() config = load_config(args.config) clips = resolve_clips(config) if not clips: print("Error: No clips in config", file=sys.stderr) sys.exit(1) # Get video dimensions from first source first_video = clips[0]["video"] _, width, height, fps, _ = get_video_info(first_video) print(f"Video: {width}x{height}, {fps:.2f}fps") # Scale font size based on shorter side ref_dimension = min(width, height) font_size = int(args.font_size * ref_dimension / 1080) # Find font font_path, font_name = find_chinese_font(args.font_path) print(f"Font: {font_name}") # Subtitle style sub_style = args.subtitle_style or config.get("subtitle_style", "normal") if sub_style == "karaoke": has_words = any("words" in c for c in clips) print(f"Subtitles: karaoke (word-level highlight)") if not has_words: print(" Note: No word timestamps in transcript — using even distribution fallback") # --- Step 1: Build segment selection filter --- if _clips_in_temporal_order(clips): base_filter, input_files = build_select_filter(clips, fps) print(f"Using select filter: {len(clips)} segments from 1 video") else: base_filter, input_files = build_trim_filter(clips, target_w=width, target_h=height) print(f"Using trim/concat filter: {len(clips)} clips from {len(input_files)} video(s)") # Collect all speeds to render (1.0 = original, plus any extras) all_speeds = [1.0] + [s for s in args.speed if s != 1.0] total_duration = sum(c["end"] - c["start"] for c in clips) title = config.get("title", "") chapters = config.get("chapters", []) encode_args = get_ffmpeg_encode_args() output_path = os.path.abspath(args.output) os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) temp_files = [] failed_speeds = [] # Cover duration: CLI arg > config > default 2.0 (0 if no title or --no-cover) if args.cover_duration is not None: cover_duration = args.cover_duration else: cover_duration = config.get("cover_duration", 2.0) if not title or args.no_cover: cover_duration = 0.0 # Generate cover PNG once (reused across all speed variants) cover_png_path = None cover_style = config.get("cover_style", "bold") cover_subtitle = config.get("subtitle", None) use_frame = config.get("cover_use_frame", False) custom_cover = config.get("cover_image", None) if cover_duration > 0 and custom_cover and os.path.isfile(custom_cover): cover_png_path = os.path.abspath(custom_cover) print(f"Cover: {cover_duration:.1f}s freeze + custom image ({custom_cover})") elif cover_duration > 0 and title: cover_png_path = generate_cover_png( clips[0]["video"], title, width, height, temp_files, style=cover_style, subtitle=cover_subtitle, use_frame=use_frame, ) if cover_png_path: print(f"Cover: {cover_duration:.1f}s freeze + Chrome-rendered overlay") else: print(f"Cover: {cover_duration:.1f}s freeze (no title overlay — Chrome not found)") # --- BGM config --- bgm_path = args.bgm or config.get("bgm") if bgm_path and os.path.isfile(bgm_path): bgm_path = os.path.abspath(bgm_path) elif bgm_path: print(f"Warning: BGM file not found: {bgm_path}", file=sys.stderr) bgm_path = None bgm_volume = args.bgm_volume if args.bgm_volume is not None else config.get("bgm_volume", 0.15) bgm_fade_out = config.get("bgm_fade_out", 3.0) if bgm_path: print(f"BGM: {os.path.basename(bgm_path)} (volume={bgm_volume}, fade_out={bgm_fade_out}s)") for speed in all_speeds: if speed == 1.0: out_path = output_path label = "1x" else: base, ext = os.path.splitext(output_path) speed_label = f"{speed}x".replace(".", "_") out_path = f"{base}_{speed_label}{ext}" label = f"{speed}x" effective_duration = total_duration / speed # --- Build subtitle ASS (scaled for speed, offset by cover duration) --- end_cards = config.get("end_cards", None) ass_path = None end_cards_duration = 0.0 subtitle_style = args.subtitle_style or config.get("subtitle_style", "normal") if not args.no_subtitles: if subtitle_style == "karaoke": highlight_color = config.get("subtitle_highlight_color", "#FFFF00") base_color = config.get("subtitle_base_color", "#FFFFFF") base_alpha = config.get("subtitle_base_alpha", "80") ass_content, _, end_cards_duration = build_karaoke_ass( clips, font_name, font_size, width, height, speed=speed, cover_duration=cover_duration, end_cards=end_cards, highlight_color=highlight_color, base_color=base_color, base_alpha=base_alpha, ) else: ass_content, _, end_cards_duration = build_merged_ass( clips, font_name, font_size, width, height, speed=speed, cover_duration=cover_duration, end_cards=end_cards, subtitle_style=subtitle_style, ) fd, ass_path = tempfile.mkstemp(suffix=".ass", prefix=f"sub_{label}_") with os.fdopen(fd, "w", encoding="utf-8") as f: f.write(ass_content) temp_files.append(ass_path) # --- Build video filter chain on [merged_v] --- vf_parts = [] # Speed adjustment (before cover padding so cover stays at normal speed) if speed != 1.0: vf_parts.append(f"setpts=PTS/{speed}") # Cover: freeze first frame for cover_duration seconds if cover_duration > 0: vf_parts.append( f"tpad=start_duration={cover_duration}:start_mode=clone" ) # Subtitles (ASS timing already includes cover offset) if ass_path: escaped_ass = escape_ffmpeg_path(ass_path) if font_path: fonts_dir = escape_ffmpeg_path(os.path.dirname(font_path)) vf_parts.append(f"ass='{escaped_ass}':fontsdir='{fonts_dir}'") else: vf_parts.append(f"ass='{escaped_ass}'") # --- Build audio filter chain on [merged_a] --- af_parts = [] if speed != 1.0: remaining = speed while remaining > 2.0: af_parts.append("atempo=2.0") remaining /= 2.0 af_parts.append(f"atempo={remaining:.4f}") # Audio: add silence for cover duration (after speed adjustment) if cover_duration > 0: delay_ms = int(cover_duration * 1000) af_parts.append(f"adelay={delay_ms}:all=1") # Note: end cards silence is provided by anullsrc in the concat, no apad needed # --- Extra inputs tracking (order matters: must match -i order in cmd) --- extra_inputs = [] # list of (type, idx, path) bgm_input_idx = None if bgm_path: bgm_input_idx = len(input_files) + len(extra_inputs) extra_inputs.append(("bgm", bgm_input_idx, bgm_path)) bgm_total = effective_duration + cover_duration + end_cards_duration cover_input_idx = None if cover_png_path and cover_duration > 0: cover_input_idx = len(input_files) + len(extra_inputs) extra_inputs.append(("cover", cover_input_idx, cover_png_path)) vf_parts.append( f"[cover_img]overlay=0:0:enable='lte(t,{cover_duration:.4f})'") overlay_input_idx = None overlay_path = config.get("video_overlay") if overlay_path and os.path.isfile(overlay_path): overlay_input_idx = len(input_files) + len(extra_inputs) extra_inputs.append(("overlay", overlay_input_idx, os.path.abspath(overlay_path))) vf_parts.append( f"[overlay_img]overlay=0:0:enable='gt(t,{cover_duration:.4f})'") rec_blink = config.get("rec_blink") rec_dot_input_idx = None if rec_blink: dot_path = rec_blink.get("dot_image") if dot_path and os.path.isfile(dot_path): rec_dot_input_idx = len(input_files) + len(extra_inputs) extra_inputs.append(("rec_dot", rec_dot_input_idx, os.path.abspath(dot_path))) bx = rec_blink.get("x", 62) by = rec_blink.get("y", 55) period = rec_blink.get("period", 1.0) half = period / 2 vf_parts.append( f"[rec_dot]overlay={bx}:{by}:enable='if(gt(t,{cover_duration:.1f}),gte(mod(t,{period:.2f}),{half:.2f}),0)'" ) # --- Assemble full filter_complex --- filter_lines = [base_filter] # End cards: concat black frames after merged video if end_cards_duration > 0: fps_val = fps filter_lines.append( f"color=c=black:s={width}x{height}:d={end_cards_duration:.4f}:r={fps_val:.4f}[black_v]" ) filter_lines.append( f"anullsrc=r=48000:cl=stereo:d={end_cards_duration:.4f}[black_a]" ) filter_lines.append( f"[merged_v][merged_a][black_v][black_a]concat=n=2:v=1:a=1[merged_v2][merged_a2]" ) # Replace labels for downstream processing merged_v_label = "[merged_v2]" merged_a_label = "[merged_a2]" else: merged_v_label = "[merged_v]" merged_a_label = "[merged_a]" if vf_parts: # Count how many overlay operations we have at the end overlay_count = sum(1 for x in [cover_input_idx, overlay_input_idx, rec_dot_input_idx] if x is not None) if overlay_count > 0: pre_parts = vf_parts[:-overlay_count] overlay_parts = vf_parts[-overlay_count:] current_label = merged_v_label if pre_parts: pre_chain = ",".join(pre_parts) filter_lines.append(f"{current_label}{pre_chain}[pre_v]") current_label = "[pre_v]" for oi, opart in enumerate(overlay_parts): out_label = f"[ov{oi}]" if oi < len(overlay_parts) - 1 else "[final_v]" filter_lines.append(f"{current_label}{opart}{out_label}") current_label = out_label else: vf_chain = ",".join(vf_parts) filter_lines.append(f"{merged_v_label}{vf_chain}[final_v]") map_v = "[final_v]" else: map_v = merged_v_label if af_parts: af_chain = ",".join(af_parts) filter_lines.append(f"{merged_a_label}{af_chain}[voice_a]") voice_label = "[voice_a]" else: voice_label = merged_a_label # BGM: loop, trim, volume, fade out, then amix with voice if bgm_input_idx is not None: bgm_filters = [ f"aloop=loop=-1:size=2147483647", f"atrim=duration={bgm_total:.4f}", f"asetpts=PTS-STARTPTS", f"volume={bgm_volume:.2f}", ] if bgm_fade_out > 0: fade_start = max(0, bgm_total - bgm_fade_out) bgm_filters.append(f"afade=t=out:st={fade_start:.4f}:d={bgm_fade_out:.4f}") bgm_chain = ",".join(bgm_filters) filter_lines.append(f"[{bgm_input_idx}:a]{bgm_chain}[bgm_a]") filter_lines.append( f"{voice_label}[bgm_a]amix=inputs=2:duration=first:dropout_transition=0[final_a]" ) map_a = "[final_a]" else: map_a = voice_label # Add cover image scaling/labeling if needed if cover_input_idx is not None: cover_prep = f"[{cover_input_idx}:v]scale={width}:{height},format=rgba[cover_img]" filter_lines.insert(1, cover_prep) # Add persistent overlay image scaling/labeling if needed if overlay_input_idx is not None: overlay_prep = f"[{overlay_input_idx}:v]scale={width}:{height},format=rgba[overlay_img]" filter_lines.insert(1 + (1 if cover_input_idx is not None else 0), overlay_prep) # Add REC dot image prep if needed if rec_dot_input_idx is not None: prep_idx = 1 + sum(1 for x in [cover_input_idx, overlay_input_idx] if x is not None) # BGM filter lines are appended (not inserted), so no offset needed here dot_prep = f"[{rec_dot_input_idx}:v]format=rgba[rec_dot]" filter_lines.insert(prep_idx, dot_prep) full_filter = ";\n".join(filter_lines) # Write filter to temp file fd, filter_path = tempfile.mkstemp(suffix=".txt", prefix=f"fc_{label}_") with os.fdopen(fd, "w", encoding="utf-8") as f: f.write(full_filter) temp_files.append(filter_path) # --- Single ffmpeg encode from source --- cmd = ["ffmpeg", "-y"] for inp in input_files: cmd.extend(["-i", inp]) # Add extra inputs in tracked order for etype, eidx, epath in extra_inputs: cmd.extend(["-i", epath]) cmd.extend([ "-filter_complex_script", filter_path, "-map", map_v, "-map", map_a, ]) cmd.extend(encode_args) cmd.extend(["-c:a", "aac", "-b:a", "192k", "-shortest"]) cmd.append(out_path) total_out = effective_duration + cover_duration + end_cards_duration print(f"\nRendering {label} ({total_out:.0f}s)...") try: subprocess.run(cmd, check=True, capture_output=True, text=True) except subprocess.CalledProcessError as e: print(f"FFmpeg error ({label}):\n{e.stderr[-2000:]}", file=sys.stderr) failed_speeds.append(label) continue size_mb = os.path.getsize(out_path) / 1024 / 1024 print(f"Done: {out_path} ({size_mb:.1f}MB)") # Report failures if failed_speeds: print(f"\nWARNING: Failed to render: {', '.join(failed_speeds)}", file=sys.stderr) # Print chapter timeline (for pasting into Xiaohongshu / YouTube etc.) if chapters: print("\n时间轴(可直接复制到小红书):") for ch in chapters: t = ch["start"] + cover_duration m, s = divmod(t, 60) print(f" {int(m)}:{int(s):02d} {ch.get('title', '')}") # --- Multi-platform format export --- base_output = args.output if args.formats and os.path.isfile(base_output): for fmt_name in args.formats: fmt = OUTPUT_FORMATS[fmt_name] fmt_output = base_output.replace(".mp4", f"_{fmt_name}.mp4") reformat = build_reformat_filter(width, height, fmt["width"], fmt["height"]) fmt_cmd = [ "ffmpeg", "-y", "-i", base_output, "-vf", reformat, "-c:v", "libx264", "-crf", "18", "-preset", "medium", "-c:a", "copy", fmt_output, ] print(f"\nRendering {fmt['label']}...") try: subprocess.run(fmt_cmd, check=True, capture_output=True, text=True) size_mb = os.path.getsize(fmt_output) / 1024 / 1024 print(f"Done: {fmt_output} ({size_mb:.1f}MB)") except subprocess.CalledProcessError as e: print(f"Format error ({fmt_name}):\n{e.stderr[-2000:]}", file=sys.stderr) # --- Cleanup --- for p in temp_files: if p and os.path.exists(p): os.remove(p) if temp_files: print("\nTemp files cleaned up.") if __name__ == "__main__": main()