#!/usr/bin/env python3
"""Simple CLI extractor using Scrapling.

Fetches a URL (or reads a local HTML file), applies a CSS and/or XPath
selector, and prints the result as JSON on stdout. Diagnostics go to stderr.

Examples:
    python scripts/extract_with_scrapling.py --url https://example.com --css "h1::text"
    python scripts/extract_with_scrapling.py --url https://example.com --css "h1::text" --fetcher dynamic
    python scripts/extract_with_scrapling.py --url https://example.com --css ".price::text" --fetcher stealthy
    python scripts/extract_with_scrapling.py --html-file page.html --xpath "//title/text()"

Exit codes:
    0  success (JSON written to stdout, even when nothing matched)
    1  fetching or parsing failed
    2  usage error, or Scrapling (or its fetchers extra) is not installed
"""

from __future__ import annotations

import argparse
import json
import logging
import sys
from pathlib import Path
from typing import Any

# Logs are sent to stderr so that stdout carries nothing but the JSON payload.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s", stream=sys.stderr)
log = logging.getLogger(__name__)

# Human-readable labels used in log messages, keyed by selector kind.
_SELECTOR_LABELS = {"css": "CSS", "xpath": "XPath"}


def _make_parser() -> argparse.ArgumentParser:
    """Build the command-line parser.

    Exactly one of ``--url`` / ``--html-file`` is required. The selector
    options are optional here; ``main`` enforces that at least one is given.
    """
    parser = argparse.ArgumentParser(description="Extract data from HTML using Scrapling.")

    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--url", help="HTTP(S) URL to fetch with Scrapling Fetcher.")
    source.add_argument("--html-file", help="Local HTML file path to parse.")

    parser.add_argument("--css", help="CSS selector (example: h1::text).")
    parser.add_argument("--xpath", help="XPath selector (example: //h1/text()).")
    parser.add_argument(
        "--fetcher",
        choices=["static", "dynamic", "stealthy"],
        default="static",
        help="Fetcher type: static (default), dynamic (JS rendering), stealthy (anti-bot).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="Request timeout in seconds (default: 30).",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Return all matches. Default returns first match only.",
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output.",
    )
    return parser


def _first_or_all(result: Any, want_all: bool) -> Any:
    """Return *result* unchanged when *want_all*; otherwise collapse it.

    In first-match mode a list is reduced to its first element (``None`` when
    the list is empty); any non-list value is passed through as-is.
    """
    if want_all:
        return result
    if isinstance(result, list):
        return result[0] if result else None
    return result


def _count_matches(result: Any) -> int:
    """Count the matches represented by *result*.

    A list counts its items, ``None`` counts as zero, anything else is one.
    """
    if isinstance(result, list):
        return len(result)
    return 0 if result is None else 1


def _to_jsonable(value: Any) -> Any:
    """Convert a selector result into something ``json.dumps`` accepts.

    JSON scalars pass through, lists/tuples and dicts are converted
    recursively, and any other object is replaced by ``str(obj)``.
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(item) for item in value]
    if isinstance(value, dict):
        return {str(key): _to_jsonable(item) for key, item in value.items()}
    # A selector without ``::text`` / ``text()`` yields an element object in
    # first-match mode, which json.dumps would reject with a TypeError.
    # NOTE(review): assumes str() of such an element is a useful textual form
    # (presumably its markup) -- confirm against the installed Scrapling.
    return str(value)


def _run_selector(page: Any, kind: str, expression: str, want_all: bool) -> Any:
    """Apply one selector to *page* and return a JSON-ready result.

    Args:
        page: Parsed page exposing ``css``/``xpath`` and the matching
            ``css_first``/``xpath_first`` methods.
        kind: ``"css"`` or ``"xpath"``; selects which pair of methods to call.
        expression: The selector text supplied by the user.
        want_all: Return every match (via ``.getall()``) instead of the first.

    Returns:
        A list of matches when *want_all*, else the first match or ``None``.
    """
    label = _SELECTOR_LABELS[kind]
    if want_all:
        raw = getattr(page, kind)(expression).getall()
    else:
        raw = getattr(page, f"{kind}_first")(expression)

    result = _first_or_all(raw, want_all)
    match_count = _count_matches(result)
    log.info("%s '%s' → %d match(es)", label, expression, match_count)
    # Keyed on the count rather than ``result is None`` so that an empty list
    # from --all mode is reported as well.
    if match_count == 0:
        log.warning("%s selector '%s' returned no matches", label, expression)
    return _to_jsonable(result)


def _fetch_url(url: str, fetcher_type: str, timeout: int) -> Any:
    """Fetch *url* with the requested Scrapling fetcher and return the page.

    Args:
        url: Address to fetch.
        fetcher_type: ``"dynamic"``, ``"stealthy"``, or anything else for the
            plain static fetcher.
        timeout: Timeout in seconds.

    Exits the process with status 2 when the Scrapling fetchers cannot be
    imported. Any other exception propagates to the caller.
    """
    try:
        if fetcher_type == "dynamic":
            from scrapling.fetchers import DynamicFetcher  # type: ignore

            log.info("Fetching with DynamicFetcher: %s", url)
            # The browser-based fetchers are given the timeout in milliseconds.
            page = DynamicFetcher().fetch(url, timeout=timeout * 1000)
        elif fetcher_type == "stealthy":
            from scrapling.fetchers import StealthyFetcher  # type: ignore

            log.info("Fetching with StealthyFetcher: %s", url)
            page = StealthyFetcher().fetch(url, timeout=timeout * 1000)
        else:
            from scrapling.fetchers import Fetcher  # type: ignore

            log.info("Fetching with Fetcher: %s", url)
            # Plain HTTP GET; the static fetcher is given the timeout in
            # seconds, so --timeout is honoured for every fetcher type.
            # NOTE(review): confirm ``Fetcher.get(url, timeout=...)`` against
            # the installed Scrapling version.
            page = Fetcher().get(url, timeout=timeout)
    except ImportError:
        print(
            "Scrapling fetchers not installed. Run: pip install 'scrapling[fetchers]' && scrapling install",
            file=sys.stderr,
        )
        sys.exit(2)
    log.info("Fetch complete: %s", url)
    return page


def _load_html_parser() -> Any:
    """Return Scrapling's HTML parser class.

    Tries ``Adaptor`` first and falls back to ``Selector``.
    NOTE(review): the fallback assumes newer Scrapling releases expose the
    parser as ``Selector`` -- confirm. Raises ImportError when neither name
    can be imported.
    """
    try:
        from scrapling import Adaptor as parser_cls  # type: ignore
    except ImportError:
        from scrapling import Selector as parser_cls  # type: ignore
    return parser_cls


def main() -> int:
    """Run the CLI and return the process exit status.

    Returns:
        0 on success, 1 when fetching/parsing fails, 2 when Scrapling is not
        installed. Usage errors exit via ``parser.error`` (status 2).
    """
    parser = _make_parser()
    args = parser.parse_args()

    if not args.css and not args.xpath:
        parser.error("Provide at least one selector with --css or --xpath.")

    html_path = None
    if not args.url:
        html_path = Path(args.html_file)
        # Validate the path up front so a bad path is reported as a usage
        # error regardless of whether Scrapling is importable.
        if not html_path.exists():
            parser.error(f"HTML file not found: {html_path}")

    try:
        if args.url:
            page = _fetch_url(args.url, args.fetcher, args.timeout)
        else:
            parser_cls = _load_html_parser()
            log.info("Parsing local file: %s", html_path)
            page = parser_cls(html_path.read_text(encoding="utf-8"))
    except ImportError:
        print("Scrapling is not installed. Run: pip install scrapling", file=sys.stderr)
        return 2
    except Exception as exc:  # top-level boundary: report and exit non-zero
        log.error("Failed to fetch/parse: %s", exc)
        return 1

    output: dict[str, Any] = {}
    if args.css:
        output["css"] = _run_selector(page, "css", args.css, args.all)
    if args.xpath:
        output["xpath"] = _run_selector(page, "xpath", args.xpath, args.all)

    if args.pretty:
        print(json.dumps(output, indent=2, ensure_ascii=True))
    else:
        print(json.dumps(output, separators=(",", ":"), ensure_ascii=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())