#!/usr/bin/env python3
"""
Parse Markdown for X Articles publishing.

Extracts:
- title
- cover image
- content images with block_index
- dividers with block_index
- rich-text HTML content
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import sys
import tempfile
import urllib.parse
import urllib.request
from pathlib import Path

# Directories searched by bare filename when a local image reference does not
# resolve at the path written in the Markdown.
SEARCH_DIRS = [
    Path.home() / "Downloads",
    Path.home() / "Desktop",
    Path.home() / "Pictures",
]
# Front-matter keys that may name the cover image, in priority order.
FRONTMATTER_COVER_KEYS = ("cover_image", "coverImage", "cover", "image", "featureImage", "feature_image")
# Download cache for remote images; file names are derived from a hash of the URL.
REMOTE_IMAGE_DIR = Path(tempfile.gettempdir()) / "x-article-publisher-images"

# File extension used for each supported image media type.
_CONTENT_TYPE_EXTENSIONS = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
}
# A line made only of three or more dashes is a divider.
_DIVIDER_RE = re.compile(r"^---+$")
# Loose test used while splitting: does the whole line look like an image?
_IMAGE_LINE_RE = re.compile(r"^!\[.*\]\(.*\)$")
# Strict parse used when extracting: group 1 = alt text, group 2 = destination.
_IMAGE_BLOCK_RE = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)$")
# Destination followed by an optional quoted title: path "title" / path 'title'.
_IMAGE_TITLE_RE = re.compile(r"""^(?P<dest>.*?)\s+(?:"[^"]*"|'[^']*')$""", re.DOTALL)
# ATX heading: one to six hashes, whitespace, then the heading text.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")


def parse_frontmatter(content: str) -> tuple[dict[str, str], str]:
    """Split a leading ``---`` delimited front-matter section off *content*.

    Only flat ``key: value`` lines are understood; blank lines, ``#`` comment
    lines and lines without a colon are ignored. A value wrapped in matching
    single or double quotes has the quotes removed.

    Returns:
        ``(metadata, body)``. When *content* does not start with ``---`` or
        the closing ``---`` is missing, returns ``({}, content)`` unchanged.
    """
    lines = content.splitlines()
    # A UTF-8 BOM is not whitespace for str.strip(), so drop it explicitly
    # before testing for the opening delimiter.
    if not lines or lines[0].lstrip("\ufeff").strip() != "---":
        return {}, content

    for idx, candidate in enumerate(lines[1:], start=1):
        if candidate.strip() != "---":
            continue

        metadata: dict[str, str] = {}
        for raw_line in lines[1:idx]:
            text = raw_line.strip()
            if not text or text.startswith("#") or ":" not in raw_line:
                continue
            key, value = raw_line.split(":", 1)
            key = key.strip()
            value = value.strip()
            if not key:
                continue
            if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
                value = value[1:-1]
            metadata[key] = value.strip()
        return metadata, "\n".join(lines[idx + 1 :]).lstrip()

    # Opening delimiter without a closing one: treat everything as body.
    return {}, content


def pick_frontmatter_value(metadata: dict[str, str], keys: tuple[str, ...]) -> str | None:
    """Return the first non-empty value in *metadata* among *keys*, else ``None``."""
    return next((metadata[key] for key in keys if metadata.get(key)), None)


def extension_from_content_type(content_type: str | None) -> str:
    """Map an image media type to a file extension, defaulting to ``.png``.

    Parameters after a ``;`` (for example ``image/png; charset=binary``) are
    ignored, and the comparison is case-insensitive.
    """
    media_type = (content_type or "").split(";", 1)[0].strip().lower()
    return _CONTENT_TYPE_EXTENSIONS.get(media_type, ".png")


def get_remote_image_path(url: str) -> tuple[str, bool]:
    """Download an HTTPS image into ``REMOTE_IMAGE_DIR`` and return its path.

    A file already present for the same URL hash is reused without any
    network access.

    Returns:
        ``(local_path, True)`` on success or cache hit; ``(url, False)`` when
        the URL is not HTTPS or the download fails. A warning is written to
        stderr in both failure cases.
    """
    if not url.startswith("https://"):
        print(f"[parse_markdown] WARNING: skipping non-HTTPS image: '{url}'", file=sys.stderr)
        return url, False

    REMOTE_IMAGE_DIR.mkdir(parents=True, exist_ok=True)
    parsed = urllib.parse.urlparse(url)
    suffix = Path(parsed.path).suffix.lower()
    if suffix not in {".png", ".jpg", ".jpeg", ".gif", ".webp"}:
        # Unknown or missing extension: decide from the response Content-Type.
        suffix = ""

    # MD5 is used only to derive a short, stable cache file name.
    digest = hashlib.md5(url.encode("utf-8")).hexdigest()[:10]
    existing = sorted(REMOTE_IMAGE_DIR.glob(f"remote_{digest}.*"))
    if existing:
        return str(existing[0]), True

    try:
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(request, timeout=30) as response:
            if not suffix:
                suffix = extension_from_content_type(response.headers.get_content_type())
            output_path = REMOTE_IMAGE_DIR / f"remote_{digest}{suffix}"
            output_path.write_bytes(response.read())
        return str(output_path), True
    except Exception as exc:
        # Deliberately broad: a failed download must not abort parsing; the
        # caller receives exists=False and the reason is reported on stderr.
        print(f"[parse_markdown] WARNING: failed to download '{url}': {exc}", file=sys.stderr)
        return url, False


def resolve_image_path(image_reference: str, base_path: Path) -> tuple[str, str, bool]:
    """Resolve an image reference from the Markdown to a usable file path.

    Remote references (``http://`` / ``https://``) go through
    :func:`get_remote_image_path`. Local references are percent-decoded,
    joined onto *base_path* unless absolute, and then looked up with
    :func:`find_image_file`.

    Returns:
        ``(path_to_use, original_path, exists)``.
    """
    if image_reference.startswith(("https://", "http://")):
        path, exists = get_remote_image_path(image_reference)
        return path, image_reference, exists

    decoded_reference = urllib.parse.unquote(image_reference)
    if os.path.isabs(decoded_reference):
        resolved_path = decoded_reference
    else:
        resolved_path = str(base_path / decoded_reference)
    filename = os.path.basename(decoded_reference)
    full_path, exists = find_image_file(resolved_path, filename)
    return full_path, resolved_path, exists


def find_image_file(original_path: str, filename: str) -> tuple[str, bool]:
    """Locate an image, falling back to ``SEARCH_DIRS`` by bare *filename*.

    Returns:
        ``(path, True)`` for the first existing candidate, otherwise
        ``(original_path, False)``. Fallback hits and misses are reported on
        stderr.
    """
    if os.path.isfile(original_path):
        return original_path, True

    for search_dir in SEARCH_DIRS:
        candidate = search_dir / filename
        if candidate.is_file():
            print(
                f"[parse_markdown] Image not found at '{original_path}', using '{candidate}' instead",
                file=sys.stderr,
            )
            return str(candidate), True

    print(
        f"[parse_markdown] WARNING: Image not found: '{original_path}' "
        f"(also searched {[str(d) for d in SEARCH_DIRS]})",
        file=sys.stderr,
    )
    return original_path, False


def split_into_blocks(markdown: str) -> list[str]:
    """Split Markdown text into top-level blocks.

    Produces, in document order:
    - paragraphs (consecutive ordinary lines joined with ``\\n``),
    - each heading / blockquote / image line as its own block,
    - ``___DIVIDER___`` for a line of three or more dashes,
    - fenced code wrapped as
      ``___CODE_BLOCK_START___...___CODE_BLOCK_END___``.

    An unterminated code fence is still emitted at the end of the input.
    """
    blocks: list[str] = []
    paragraph: list[str] = []
    code_lines: list[str] = []
    in_code_block = False

    def flush_paragraph() -> None:
        # Emit the paragraph collected so far, if any.
        if paragraph:
            blocks.append("\n".join(paragraph))
            paragraph.clear()

    def flush_code() -> None:
        # Emit the fenced code collected so far; an empty fence yields nothing.
        if code_lines:
            blocks.append("___CODE_BLOCK_START___" + "\n".join(code_lines) + "___CODE_BLOCK_END___")
            code_lines.clear()

    # Normalise CRLF / CR so no stray "\r" ends up inside paragraph or code text.
    normalized = markdown.replace("\r\n", "\n").replace("\r", "\n")
    for line in normalized.split("\n"):
        stripped = line.strip()

        if stripped.startswith("```"):
            if in_code_block:
                flush_code()
            else:
                flush_paragraph()
            in_code_block = not in_code_block
            continue

        if in_code_block:
            # Keep code lines verbatim, including blank lines and indentation.
            code_lines.append(line)
            continue

        if not stripped:
            flush_paragraph()
            continue

        if _DIVIDER_RE.match(stripped):
            flush_paragraph()
            blocks.append("___DIVIDER___")
            continue

        if stripped.startswith(("#", ">")) or _IMAGE_LINE_RE.match(stripped):
            flush_paragraph()
            blocks.append(stripped)
            continue

        paragraph.append(line)

    flush_paragraph()
    flush_code()
    return blocks


def _last_line_excerpt(clean_blocks: list[str]) -> str:
    """Return up to 80 characters of the last non-blank line of the last block."""
    if not clean_blocks:
        return ""
    lines = [line for line in clean_blocks[-1].strip().split("\n") if line.strip()]
    return lines[-1][:80] if lines else ""


def _image_destination(raw: str) -> str:
    """Extract the link destination from the parenthesised part of an image.

    Drops an optional trailing quoted title (``path "title"``) and optional
    angle brackets (``<path with spaces>``). Falls back to *raw* when nothing
    usable remains.
    """
    destination = raw.strip()
    titled = _IMAGE_TITLE_RE.match(destination)
    if titled and titled.group("dest"):
        destination = titled.group("dest").strip()
    if len(destination) > 2 and destination.startswith("<") and destination.endswith(">"):
        destination = destination[1:-1]
    return destination or raw


def extract_images_and_dividers(markdown: str, base_path: Path) -> tuple[list[dict], list[dict], str, int]:
    """Pull image and divider blocks out of *markdown*.

    ``block_index`` on each image / divider is the number of remaining
    (non-image, non-divider) blocks that precede it, and ``after_text`` is an
    excerpt of the last line of the preceding remaining block.

    Returns:
        ``(images, dividers, clean_markdown, clean_block_count)`` where
        ``clean_markdown`` is the remaining blocks joined by blank lines.
    """
    images: list[dict] = []
    dividers: list[dict] = []
    clean_blocks: list[str] = []

    for block in split_into_blocks(markdown):
        stripped = block.strip()

        if stripped == "___DIVIDER___":
            dividers.append(
                {
                    "block_index": len(clean_blocks),
                    "after_text": _last_line_excerpt(clean_blocks),
                }
            )
            continue

        match = _IMAGE_BLOCK_RE.match(stripped)
        if match is None:
            clean_blocks.append(block)
            continue

        full_path, original_path, exists = resolve_image_path(_image_destination(match.group(2)), base_path)
        images.append(
            {
                "path": full_path,
                "original_path": original_path,
                "exists": exists,
                "alt": match.group(1),
                "block_index": len(clean_blocks),
                "after_text": _last_line_excerpt(clean_blocks),
            }
        )

    return images, dividers, "\n\n".join(clean_blocks), len(clean_blocks)


def extract_title(markdown: str) -> tuple[str, str]:
    """Derive the article title from the start of *markdown*.

    Leading blank lines and image lines are skipped. The first remaining line
    decides the title:
    - a heading of any level gives its text without the ``#`` marker;
    - any other line gives its first 100 characters.

    Only a level-1 heading is removed from the returned Markdown; in every
    other case the Markdown is returned unchanged. The title is ``"Untitled"``
    when no candidate line exists.
    """
    lines = markdown.strip().split("\n")
    title = "Untitled"
    title_line_idx: int | None = None

    for idx, line in enumerate(lines):
        stripped = line.strip()
        if not stripped:
            continue

        heading = _HEADING_RE.match(stripped)
        if heading:
            title = heading.group(2).strip()
            if len(heading.group(1)) == 1:
                # The H1 becomes the article title, so it must not also
                # appear in the body.
                title_line_idx = idx
            break

        if not stripped.startswith("!["):
            title = stripped[:100]
            break

    if title_line_idx is not None:
        lines.pop(title_line_idx)
        markdown = "\n".join(lines)
    return title, markdown


# NOTE(review): `markdown_to_html` begins at this point in the file, but in the
# reviewed text its HTML string literals are garbled and the definition is cut
# off before it ends, so it is not reproduced here. Restore it from an intact
# copy of the file.
".join(line for line in lines if line.strip()) return f"
{formatted}
" html = re.sub(r"___CODE_BLOCK_START___(.*?)___CODE_BLOCK_END___", convert_code_block, html, flags=re.DOTALL) html = re.sub(r"^## (.+)$", r"

\1

", html, flags=re.MULTILINE) html = re.sub(r"^### (.+)$", r"

\1

", html, flags=re.MULTILINE) html = re.sub(r"\*\*(.+?)\*\*", r"\1", html) html = re.sub(r"\*([^*]+)\*", r"\1", html) html = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r'\1', html) html = re.sub(r"^> (.+)$", r"
\1
", html, flags=re.MULTILINE) html = re.sub(r"^- (.+)$", r"
  • \1
  • ", html, flags=re.MULTILINE) html = re.sub(r"^\d+\. (.+)$", r"
  • \1
  • ", html, flags=re.MULTILINE) html = re.sub(r"((?:
  • .*?
  • \n?)+)", r"", html) parts = html.split("\n\n") processed: list[str] = [] for part in parts: part = part.strip() if not part: continue if part.startswith(("

    ", "

    ", "
    ", "