Install
openclaw skills install wrynai-skill
Perform advanced web crawling and content extraction with multi-page crawling, search result parsing, pattern filtering, and screenshot capture using the WrynAI SDK.
openclaw skills install wrynai-skill
This skill enables OpenClaw to perform advanced web crawling and content extraction using the WrynAI SDK. It provides capabilities for multi-page crawling, content extraction, search engine results parsing, and intelligent data gathering from websites.
# Install the WrynAI SDK
pip install wrynai
# Set your API key as environment variable
export WRYNAI_API_KEY="your-api-key-here"
Sign up at https://wryn.ai to obtain an API key. The key must be set in the WRYNAI_API_KEY environment variable.
Use this when the user wants to crawl an entire website or section of a website.
import os
from wrynai import WrynAI, WrynAIError
def crawl_website(url: str, max_pages: int = 10) -> dict:
    """
    Crawl a website starting from the given URL.

    Args:
        url: Starting URL for the crawl
        max_pages: Maximum number of pages to crawl (clamped to 1..10)

    Returns:
        Dictionary containing crawl results with pages and their content.
        When the SDK raises a WrynAIError, a dictionary with
        ``success=False``, ``error`` and ``status_code`` is returned instead.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        raise ValueError("WRYNAI_API_KEY environment variable required")

    # Enforce the hard upper limit and never forward zero/negative values.
    page_limit = max(1, min(max_pages, 10))

    try:
        with WrynAI(api_key=api_key) as client:
            result = client.crawl(
                url=url,
                max_pages=page_limit,
                max_depth=3,
                return_urls=True,
            )

            pages = []
            for page in result.pages:
                # NOTE(review): guards against a page carrying no URL list;
                # confirm whether the SDK can actually return None here.
                urls = page.urls or []
                pages.append(
                    {
                        "url": page.page_url,
                        "content": page.content,
                        "urls_found": len(urls),
                        "discovered_urls": urls[:10],  # First 10 URLs
                    }
                )

            return {
                "success": result.success,
                "total_pages": result.total_pages,
                "total_visited": result.total_visited,
                "pages": pages,
            }
    except WrynAIError as e:
        return {
            "success": False,
            "error": str(e),
            "status_code": getattr(e, "status_code", None),
        }
When to use:
Specialized crawling for documentation sites with pattern filtering.
from wrynai import WrynAI, Engine
def crawl_documentation(base_url: str, doc_patterns: list = None) -> list:
    """
    Crawl documentation sites with targeted URL patterns.

    Args:
        base_url: Base URL of the documentation site
        doc_patterns: List of URL patterns to include (e.g., ["/docs/", "/api/"]).
            Defaults to ["/docs/", "/guide/", "/api/", "/reference/"] when
            omitted or empty.

    Returns:
        List of crawled documentation pages, each with ``url``, ``content``
        and ``word_count``.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        # Fail early with the same message the other helpers use instead of
        # handing an empty key to the client.
        raise ValueError("WRYNAI_API_KEY environment variable required")

    include = doc_patterns or ["/docs/", "/guide/", "/api/", "/reference/"]

    with WrynAI(api_key=api_key) as client:
        result = client.crawl(
            url=base_url,
            max_pages=10,
            max_depth=3,
            include_patterns=include,
            exclude_patterns=["/internal/", "/draft/", "/changelog/", "/admin/"],
            return_urls=True,
            timeout_ms=60000,  # 60 seconds for documentation crawling
        )

        return [
            {
                "url": page.page_url,
                "content": page.content,
                # Treat missing content as zero words rather than crashing.
                "word_count": len((page.content or "").split()),
            }
            for page in result.pages
        ]
When to use:
Search for topics and crawl the top results.
from wrynai import WrynAI, CountryCode, WrynAIError
import time
def search_and_crawl(query: str, num_sites: int = 3, country: str = "US") -> list:
    """
    Search for a query and crawl the top results.

    Args:
        query: Search query
        num_sites: Number of top results to crawl
        country: Country code for search localization (case-insensitive;
            unknown codes fall back to US)

    Returns:
        List of search results with crawled content. A failed crawl yields an
        entry with ``title``, ``url`` and ``error``; a failed search yields a
        single-element list containing an ``error`` entry.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        raise ValueError("WRYNAI_API_KEY environment variable required")

    with WrynAI(api_key=api_key) as client:
        # Step 1: Perform search
        try:
            search_result = client.search(
                query=query,
                num_results=num_sites,
                # Upper-case so "us" resolves the same member as "US".
                country_code=getattr(CountryCode, str(country).upper(), CountryCode.US),
                timeout_ms=120000,
            )
        except WrynAIError as e:
            return [{"error": f"Search failed: {str(e)}"}]

        # Step 2: Crawl each result
        results = []
        for index, hit in enumerate(search_result.organic_results[:num_sites]):
            if index:
                # Rate limiting courtesy: pause between consecutive crawls,
                # whether the previous one succeeded or failed, and never
                # after the final one.
                time.sleep(1)
            try:
                crawl_result = client.crawl(
                    url=hit.url,
                    max_pages=3,
                    max_depth=1,
                    timeout_ms=60000,
                )
                results.append({
                    "search_position": hit.position,
                    "title": hit.title,
                    "url": hit.url,
                    "snippet": hit.snippet,
                    "crawled_pages": [
                        {
                            "url": page.page_url,
                            "content_preview": page.content[:500],
                            "full_content": page.content,
                        }
                        for page in crawl_result.pages
                    ],
                })
            except WrynAIError as e:
                results.append({
                    "title": hit.title,
                    "url": hit.url,
                    "error": str(e),
                })

        return results
When to use:
Extract specific content types without crawling.
from wrynai import WrynAI, Engine
def extract_page_content(url: str, content_type: str = "text") -> dict:
    """
    Extract specific content from a single page.

    Args:
        url: Target URL
        content_type: Type of content to extract
            ("text", "markdown", "structured", "links", "title")

    Returns:
        Dictionary with extracted content. An unsupported ``content_type``
        yields ``{"error": ...}``; an SDK failure yields
        ``{"url": ..., "error": ...}``.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    supported = ("text", "markdown", "structured", "links", "title")
    # Reject bad input before opening a client, so no connection or API key
    # is needed just to report a caller mistake.
    if content_type not in supported:
        return {"error": f"Unknown content_type: {content_type}"}

    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        raise ValueError("WRYNAI_API_KEY environment variable required")

    def _link_dicts(links):
        # Shared by the "structured" and "links" branches.
        return [
            {"text": link.text, "url": link.url, "internal": link.internal}
            for link in links
        ]

    with WrynAI(api_key=api_key) as client:
        try:
            if content_type == "text":
                result = client.extract_text(url, extract_main_content=True)
                return {"url": url, "text": result.text}

            if content_type == "markdown":
                result = client.extract_markdown(url, extract_main_content=True)
                return {"url": url, "markdown": result.markdown}

            if content_type == "structured":
                result = client.extract_structured_text(url)
                return {
                    "url": url,
                    "main_text": result.main_text,
                    "headings": [
                        {"level": h.level, "tag": h.tag, "text": h.text}
                        for h in result.headings
                    ],
                    "links": _link_dicts(result.links),
                }

            if content_type == "links":
                result = client.extract_links(url)
                return {"url": url, "links": _link_dicts(result.links)}

            # Only "title" remains after the validation above.
            result = client.extract_title(url)
            return {"url": url, "title": result.title}
        except WrynAIError as e:
            return {"url": url, "error": str(e)}
When to use:
Production-ready crawling with retry logic and rate limit handling.
from wrynai import WrynAI, RateLimitError, TimeoutError, ServerError, WrynAIError
import time
def robust_crawl(url: str, max_attempts: int = 3, max_pages: int = 10) -> dict:
    """
    Crawl with automatic retry and error recovery.

    Rate-limit, timeout and server errors are retried up to ``max_attempts``
    times; any other WrynAIError ends the loop immediately.

    Args:
        url: Starting URL
        max_attempts: Maximum retry attempts
        max_pages: Maximum pages to crawl (clamped to 1..10)

    Returns:
        Crawl results with success status. On failure the dictionary carries
        ``success=False`` and an ``error`` message.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        raise ValueError("WRYNAI_API_KEY environment variable required")

    # Stay within the documented per-crawl page limit.
    page_limit = max(1, min(max_pages, 10))

    with WrynAI(api_key=api_key, max_retries=3) as client:
        for attempt in range(max_attempts):
            # No point in waiting or announcing a retry after the last try.
            will_retry = attempt < max_attempts - 1
            try:
                result = client.crawl(
                    url=url,
                    max_pages=page_limit,
                    max_depth=3,
                    timeout_ms=60000,
                    retries=2,
                )
                return {
                    "success": True,
                    "attempt": attempt + 1,
                    "total_visited": result.total_visited,
                    "pages": [
                        {
                            "url": page.page_url,
                            "content_length": len(page.content),
                            "urls_found": len(page.urls),
                        }
                        for page in result.pages
                    ],
                }
            except RateLimitError as e:
                if will_retry:
                    # Use the wait time from the error when it has one,
                    # otherwise exponential backoff (5s, 10s, 20s, ...).
                    wait_time = e.retry_after or (2 ** attempt * 5)
                    print(f"Rate limited. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
            except TimeoutError:
                if will_retry:
                    print(f"Timeout on attempt {attempt + 1}. Retrying...")
            except ServerError as e:
                if will_retry:
                    wait_time = 2 ** attempt
                    print(f"Server error: {e}. Waiting {wait_time}s...")
                    time.sleep(wait_time)
            except WrynAIError as e:
                # Non-retryable SDK error: report it right away.
                return {
                    "success": False,
                    "error": str(e),
                    "error_type": type(e).__name__,
                    "attempt": attempt + 1,
                }

        return {
            "success": False,
            "error": "Maximum retry attempts exceeded",
            "attempts": max_attempts,
        }
When to use:
For single-page applications and JavaScript-rendered content.
from wrynai import WrynAI, Engine
def crawl_spa(url: str, max_pages: int = 5) -> dict:
    """
    Crawl single-page applications or JavaScript-heavy sites.

    Args:
        url: Starting URL
        max_pages: Maximum pages to crawl (clamped to 1..10)

    Returns:
        Crawl results with rendered content: ``success``, ``total_visited``
        and one entry per page.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        raise ValueError("WRYNAI_API_KEY environment variable required")

    # Stay within the documented per-crawl page limit.
    page_limit = max(1, min(max_pages, 10))

    with WrynAI(api_key=api_key) as client:
        result = client.crawl(
            url=url,
            max_pages=page_limit,
            max_depth=2,
            engine=Engine.STEALTH_MODE,  # Use browser rendering
            timeout_ms=90000,  # Longer timeout for JS rendering
            return_urls=True,
        )

        return {
            "success": result.success,
            "total_visited": result.total_visited,
            "pages": [
                {
                    "url": page.page_url,
                    "content": page.content,
                    # NOTE(review): `or []` guards a missing URL list;
                    # confirm whether the SDK can return None here.
                    "urls_found": len(page.urls or []),
                }
                for page in result.pages
            ],
        }
When to use:
# --- Crawl limits ---
# Hard limits enforced by the API
MAX_PAGES = 10 # Maximum pages per crawl
MAX_DEPTH = 3 # Maximum link depth
# --- Engines (values for the `engine` argument) ---
Engine.SIMPLE # Fast, for static HTML (default)
Engine.STEALTH_MODE # Slower, for JavaScript-rendered content
# --- Timeouts by operation type (milliseconds) ---
# Simple scraping: 30,000 ms (30 seconds)
# Crawling: 60,000 ms (60 seconds)
# Search operations: 120,000 ms (2 minutes)
# Smart extraction: 45,000 ms (45 seconds)
# --- URL pattern presets ---
# Common patterns for include_patterns
DOCS_PATTERNS = ["/docs/", "/guide/", "/api/", "/reference/"]
BLOG_PATTERNS = ["/blog/", "/posts/", "/articles/"]
# Common patterns for exclude_patterns
EXCLUDE_PATTERNS = ["/admin/", "/login/", "/draft/", "/internal/"]
MEDIA_EXCLUDE = [".pdf", ".jpg", ".png", ".mp4", ".zip"]
from wrynai import (
WrynAIError, # Base exception
AuthenticationError, # Invalid API key (401)
BadRequestError, # Invalid parameters (400)
RateLimitError, # Rate limit exceeded (429)
TimeoutError, # Request timeout
ServerError, # Server error (5xx)
ConnectionError, # Network issue
ValidationError, # Local validation error
)
# Handle the most specific SDK errors first; WrynAIError is the base class
# and therefore has to come last.
try:
    result = client.crawl(url)
except AuthenticationError:
    # Check WRYNAI_API_KEY environment variable
    pass
except RateLimitError as e:
    # Wait for e.retry_after seconds (60s when no hint is provided)
    time.sleep(e.retry_after or 60)
except TimeoutError:
    # Increase timeout_ms parameter
    pass
except WrynAIError as e:
    # General API error; not every error is guaranteed to carry a status code
    print(f"Error: {e} (status: {getattr(e, 'status_code', None)})")
import os
# Read the key from the environment and fail fast when it is missing.
api_key = os.environ.get("WRYNAI_API_KEY")
if not api_key:
    raise ValueError("WRYNAI_API_KEY environment variable required")
# Recommended - automatic resource cleanup
with WrynAI(api_key=api_key) as client:
    result = client.crawl(url)

# Not recommended - manual cleanup required
client = WrynAI(api_key=api_key)
try:
    result = client.crawl(url)
finally:
    # Runs even when crawl() raises, so the client is always closed.
    client.close()
# timeout_ms is expressed in milliseconds (30000 ms = 30 seconds).
# NOTE(review): the three values below read as alternatives to pick from per
# workload, not as sequential assignments - confirm against the original page.
# For simple pages
timeout_ms=30000
# For crawling multiple pages
timeout_ms=60000
# For JavaScript-heavy sites
timeout_ms=90000
# Graceful degradation: prefer structured extraction, fall back to plain
# text, and end with content = None when both SDK calls fail.
# Only SDK errors are caught, so programming mistakes still surface.
try:
    # Try structured extraction first
    result = client.extract_structured_text(url)
    content = result.main_text
except WrynAIError:
    try:
        # Fall back to simple text
        result = client.extract_text(url)
        content = result.text
    except WrynAIError:
        content = None
import time
# Crawl the URLs one at a time, pausing between requests.
for url in urls:
    result = client.crawl(url)
    time.sleep(1)  # Be nice to the API
Extract structured data from listing pages (e-commerce, directories).
def extract_product_listings(url: str) -> list:
    """Extract product information from listing pages.

    Args:
        url: URL of the listing page (e-commerce, directory, ...)

    Returns:
        List of dictionaries with ``title``, ``price``, ``rating`` and
        ``url``; a field missing from an item is returned as None.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        raise ValueError("WRYNAI_API_KEY environment variable required")

    wanted_fields = ("title", "price", "rating", "url")

    with WrynAI(api_key=api_key) as client:
        result = client.auto_listing(
            url=url,
            engine=Engine.STEALTH_MODE,
            timeout_ms=60000,
        )

        # NOTE(review): `or []` guards a result without items; confirm
        # whether the SDK can actually return None here.
        return [
            {field: item.get(field) for field in wanted_fields}
            for item in (result.items or [])
        ]
import base64
from wrynai import ScreenshotType
def capture_page_screenshot(
    url: str, fullpage: bool = False, filename: str = "screenshot.png"
) -> str:
    """Capture page screenshot and save to file.

    Args:
        url: Page to capture
        fullpage: Capture the full page instead of only the viewport
        filename: Path the decoded image is written to. Defaults to
            "screenshot.png"; pass a distinct name per call to avoid
            overwriting an earlier capture.

    Returns:
        The path the screenshot was written to.

    Raises:
        ValueError: If the WRYNAI_API_KEY environment variable is not set.
    """
    api_key = os.environ.get("WRYNAI_API_KEY")
    if not api_key:
        raise ValueError("WRYNAI_API_KEY environment variable required")

    shot_type = ScreenshotType.FULLPAGE if fullpage else ScreenshotType.VIEWPORT

    with WrynAI(api_key=api_key) as client:
        result = client.take_screenshot(
            url=url,
            screenshot_type=shot_type,
            timeout_ms=30000,
        )

    # Decode and save. When the payload contains a comma, everything up to
    # the first one is treated as a prefix and dropped.
    # NOTE(review): presumably a "data:image/png;base64," header; confirm.
    image_data = result.screenshot
    _prefix, comma, payload = image_data.partition(",")
    encoded = payload if comma else image_data

    with open(filename, "wb") as f:
        f.write(base64.b64decode(encoded))
    return filename
"Search for [topic] and crawl the top 5 results"
"Crawl the Python documentation and extract all API references"
"Crawl our old website and extract all blog posts in markdown"
"Find all external links on [website]"
"Crawl [site] and check if [content] is present"
"Crawl [documentation site] and create a searchable knowledge base"
- Handle `RateLimitError` appropriately
- Use `Engine.STEALTH_MODE` for SPAs (slower but necessary)

Issue: AuthenticationError
- Verify that the `WRYNAI_API_KEY` environment variable is set correctly

Issue: RateLimitError
- Respect the `e.retry_after` wait time

Issue: TimeoutError
- Increase the `timeout_ms` parameter

Issue: Empty content returned
- Use `Engine.STEALTH_MODE` for JavaScript-rendered pages

Issue: Missing links/content
- Check the `exclude_patterns` and `include_patterns` configuration

When using this skill with OpenClaw:
Set environment variable before running:
export WRYNAI_API_KEY="your-api-key"
Install dependencies:
pip install wrynai
Use in your OpenClaw workflows: