#!/usr/bin/env python3
"""
Parse Markdown for X Articles publishing.
Extracts:
- title
- cover image
- content images with block_index
- dividers with block_index
- rich-text HTML content
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import re
import sys
import tempfile
import urllib.parse
import urllib.request
from pathlib import Path
SEARCH_DIRS = [
Path.home() / "Downloads",
Path.home() / "Desktop",
Path.home() / "Pictures",
]
FRONTMATTER_COVER_KEYS = ("cover_image", "coverImage", "cover", "image", "featureImage", "feature_image")
REMOTE_IMAGE_DIR = Path(tempfile.gettempdir()) / "x-article-publisher-images"
def parse_frontmatter(content: str) -> tuple[dict[str, str], str]:
lines = content.splitlines()
if not lines or lines[0].strip() != "---":
return {}, content
for idx in range(1, len(lines)):
if lines[idx].strip() == "---":
metadata: dict[str, str] = {}
for raw_line in lines[1:idx]:
line = raw_line.strip()
if not line or line.startswith("#") or ":" not in raw_line:
continue
key, value = raw_line.split(":", 1)
key = key.strip()
value = value.strip()
if not key:
continue
if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
value = value[1:-1]
metadata[key] = value.strip()
return metadata, "\n".join(lines[idx + 1 :]).lstrip()
return {}, content
def pick_frontmatter_value(metadata: dict[str, str], keys: tuple[str, ...]) -> str | None:
for key in keys:
value = metadata.get(key)
if value:
return value
return None
def extension_from_content_type(content_type: str | None) -> str:
mapping = {
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/png": ".png",
"image/gif": ".gif",
"image/webp": ".webp",
}
return mapping.get((content_type or "").lower(), ".png")
def get_remote_image_path(url: str) -> tuple[str, bool]:
if not url.startswith("https://"):
print(f"[parse_markdown] WARNING: skipping non-HTTPS image: '{url}'", file=sys.stderr)
return url, False
REMOTE_IMAGE_DIR.mkdir(parents=True, exist_ok=True)
parsed = urllib.parse.urlparse(url)
suffix = Path(parsed.path).suffix.lower()
if suffix not in {".png", ".jpg", ".jpeg", ".gif", ".webp"}:
suffix = ""
digest = hashlib.md5(url.encode("utf-8")).hexdigest()[:10]
existing = sorted(REMOTE_IMAGE_DIR.glob(f"remote_{digest}.*"))
if existing:
return str(existing[0]), True
try:
request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(request, timeout=30) as response:
if not suffix:
suffix = extension_from_content_type(response.headers.get_content_type())
output_path = REMOTE_IMAGE_DIR / f"remote_{digest}{suffix}"
output_path.write_bytes(response.read())
return str(output_path), True
except Exception as exc:
print(f"[parse_markdown] WARNING: failed to download '{url}': {exc}", file=sys.stderr)
return url, False
def resolve_image_path(image_reference: str, base_path: Path) -> tuple[str, str, bool]:
if image_reference.startswith(("https://", "http://")):
path, exists = get_remote_image_path(image_reference)
return path, image_reference, exists
decoded_reference = urllib.parse.unquote(image_reference)
resolved_path = str(base_path / decoded_reference) if not os.path.isabs(decoded_reference) else decoded_reference
filename = os.path.basename(decoded_reference)
full_path, exists = find_image_file(resolved_path, filename)
return full_path, resolved_path, exists
def find_image_file(original_path: str, filename: str) -> tuple[str, bool]:
if os.path.isfile(original_path):
return original_path, True
for search_dir in SEARCH_DIRS:
candidate = search_dir / filename
if candidate.is_file():
print(
f"[parse_markdown] Image not found at '{original_path}', using '{candidate}' instead",
file=sys.stderr,
)
return str(candidate), True
print(
f"[parse_markdown] WARNING: Image not found: '{original_path}' "
f"(also searched {[str(d) for d in SEARCH_DIRS]})",
file=sys.stderr,
)
return original_path, False
def split_into_blocks(markdown: str) -> list[str]:
blocks: list[str] = []
current_block: list[str] = []
in_code_block = False
code_block_lines: list[str] = []
for line in markdown.split("\n"):
stripped = line.strip()
if stripped.startswith("```"):
if in_code_block:
in_code_block = False
if code_block_lines:
blocks.append("___CODE_BLOCK_START___" + "\n".join(code_block_lines) + "___CODE_BLOCK_END___")
code_block_lines = []
else:
if current_block:
blocks.append("\n".join(current_block))
current_block = []
in_code_block = True
continue
if in_code_block:
code_block_lines.append(line)
continue
if not stripped:
if current_block:
blocks.append("\n".join(current_block))
current_block = []
continue
if re.match(r"^---+$", stripped):
if current_block:
blocks.append("\n".join(current_block))
current_block = []
blocks.append("___DIVIDER___")
continue
if stripped.startswith(("#", ">")):
if current_block:
blocks.append("\n".join(current_block))
current_block = []
blocks.append(stripped)
continue
if re.match(r"^!\[.*\]\(.*\)$", stripped):
if current_block:
blocks.append("\n".join(current_block))
current_block = []
blocks.append(stripped)
continue
current_block.append(line)
if current_block:
blocks.append("\n".join(current_block))
if code_block_lines:
blocks.append("___CODE_BLOCK_START___" + "\n".join(code_block_lines) + "___CODE_BLOCK_END___")
return blocks
def extract_images_and_dividers(markdown: str, base_path: Path) -> tuple[list[dict], list[dict], str, int]:
blocks = split_into_blocks(markdown)
images: list[dict] = []
dividers: list[dict] = []
clean_blocks: list[str] = []
image_pattern = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)$")
for block in blocks:
stripped = block.strip()
if stripped == "___DIVIDER___":
block_index = len(clean_blocks)
after_text = ""
if clean_blocks:
prev_block = clean_blocks[-1].strip()
lines = [line for line in prev_block.split("\n") if line.strip()]
after_text = lines[-1][:80] if lines else ""
dividers.append({"block_index": block_index, "after_text": after_text})
continue
match = image_pattern.match(stripped)
if match:
alt_text = match.group(1)
image_path = match.group(2)
full_path, original_path, exists = resolve_image_path(image_path, base_path)
block_index = len(clean_blocks)
after_text = ""
if clean_blocks:
prev_block = clean_blocks[-1].strip()
lines = [line for line in prev_block.split("\n") if line.strip()]
after_text = lines[-1][:80] if lines else ""
images.append(
{
"path": full_path,
"original_path": original_path,
"exists": exists,
"alt": alt_text,
"block_index": block_index,
"after_text": after_text,
}
)
else:
clean_blocks.append(block)
return images, dividers, "\n\n".join(clean_blocks), len(clean_blocks)
def extract_title(markdown: str) -> tuple[str, str]:
lines = markdown.strip().split("\n")
title = "Untitled"
title_line_idx: int | None = None
for idx, line in enumerate(lines):
stripped = line.strip()
if not stripped:
continue
if stripped.startswith("# "):
title = stripped[2:].strip()
title_line_idx = idx
break
if stripped.startswith("## "):
title = stripped[3:].strip()
break
if not stripped.startswith("!["):
title = stripped[:100]
break
if title_line_idx is not None:
lines.pop(title_line_idx)
markdown = "\n".join(lines)
return title, markdown
def markdown_to_html(markdown: str) -> str:
html = markdown
def convert_code_block(match: re.Match[str]) -> str:
code_content = match.group(1)
lines = code_content.strip().split("\n")
formatted = "
".join(line for line in lines if line.strip())
return f"
{formatted}" html = re.sub(r"___CODE_BLOCK_START___(.*?)___CODE_BLOCK_END___", convert_code_block, html, flags=re.DOTALL) html = re.sub(r"^## (.+)$", r"
\1", html, flags=re.MULTILINE) html = re.sub(r"^- (.+)$", r"
", "", "
")): processed.append(part) else: processed.append(f"
{part.replace(chr(10), '
") return "".join(processed) def parse_markdown_file(filepath: str, title_override: str | None = None, cover_override: str | None = None) -> dict: path = Path(filepath) base_path = path.parent metadata, content = parse_frontmatter(path.read_text(encoding="utf-8")) title, content = extract_title(content) if title_override: title = title_override elif frontmatter_title := metadata.get("title"): title = frontmatter_title images, dividers, clean_markdown, total_blocks = extract_images_and_dividers(content, base_path) html = markdown_to_html(clean_markdown) cover_image = None cover_exists = True content_images = images[1:] if len(images) > 1 else [] cover_source = cover_override or pick_frontmatter_value(metadata, FRONTMATTER_COVER_KEYS) if cover_source: cover_image, _, cover_exists = resolve_image_path(cover_source, base_path) content_images = images elif images: cover_image = images[0]["path"] cover_exists = images[0]["exists"] missing = [img for img in images if not img["exists"]] if cover_image and not cover_exists: missing = [*missing, {"path": cover_image}] if missing: print(f"[parse_markdown] WARNING: {len(missing)} image(s) not found", file=sys.stderr) return { "title": title, "cover_image": cover_image, "cover_exists": cover_exists, "content_images": content_images, "dividers": dividers, "html": html, "total_blocks": total_blocks, "source_file": str(path.absolute()), "missing_images": len(missing), "cover_source": "frontmatter" if cover_source else ("first_content_image" if cover_image else "none"), } def main() -> int: parser = argparse.ArgumentParser(description="Parse Markdown for X Articles") parser.add_argument("file", help="Markdown file to parse") parser.add_argument("--output", choices=["json", "html"], default="json", help="Output format") parser.add_argument("--html-only", action="store_true", help="Output only HTML content") parser.add_argument("--title", help="Override title from markdown/frontmatter") parser.add_argument("--cover", help="Override cover image from frontmatter") parser.add_argument("--save-html", help="Save HTML output to a file") args = parser.parse_args() if not os.path.exists(args.file): print(f"Error: File not found: {args.file}", file=sys.stderr) return 1 result = parse_markdown_file(args.file, title_override=args.title, cover_override=args.cover) if args.save_html: Path(args.save_html).write_text(result["html"], encoding="utf-8") print(f"[parse_markdown] HTML saved to: {args.save_html}", file=sys.stderr) if args.html_only or args.output == "html": print(result["html"]) else: print(json.dumps(result, ensure_ascii=False, indent=2)) return 0 if __name__ == "__main__": raise SystemExit(main())
')}