Install
openclaw skills install dekstop-control-linux
Safe Linux desktop automation (mouse/keyboard/screenshot) with approval mode and X11/Wayland checks.
openclaw skills install dekstop-control-linux
Safe desktop automation for Linux using PyAutoGUI with explicit approvals and environment checks.
- pyautogui
- pillow
- pygetwindow (window ops; not supported on Linux)
- pyperclip (clipboard ops)
- opencv-python (optional, image match)

System packages (common):
- python3-tk, scrot, xclip or xsel
- wmctrl (window list/activate)
- xdotool (active window)

python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=True)
print(dc.get_screen_size())
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
print(dc.screenshot_to('/tmp/screen.png'))
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
print(dc.record_screen('/tmp/record.mp4', seconds=30))
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
dc.open_chrome('http://localhost:8000', wait_seconds=15)
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
def preset_open_site():
    dc.open_chrome('http://localhost:8000', wait_seconds=15)
def preset_login_site():
    dc.open_chrome('http://localhost:8000/login', wait_seconds=15)
    dc.login_form('user@example.com', 'password', wait_seconds=10)
dc.register_preset('open-site', preset_open_site)
dc.register_preset('login-site', preset_login_site)
# run presets
# dc.run_preset('open-site')
# dc.run_preset('login-site')
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
steps = [
{"action": "open_chrome", "url": "http://localhost:8000/login", "wait": 15},
{"action": "login_form", "email": "user@example.com", "password": "secret", "wait": 10},
{"action": "open_url", "url": "http://localhost:8000/target", "wait": 15},
{"action": "screenshot", "path": "/tmp/target.png"}
]
dc.run_steps(steps)
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Read text from screen
text = dc.read_text_on_screen()
print(text)
# Wait for text to appear (requires pytesseract)
if dc.wait_for_text("Success", timeout=30):
    print("Text detected!")
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Get all monitors
monitors = dc.get_monitors()
print(monitors) # [{'name': 'HDMI-1', 'x': 0, 'y': 0, 'width': 1920, 'height': 1080}, ...]
# Click on second monitor (relative 0.5, 0.5 = center)
dc.click_monitor(1, 0.5, 0.5)
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Open different browsers
dc.open_firefox('https://google.com', wait_seconds=15)
dc.open_edge('https://github.com', wait_seconds=15)
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Resize window to 800x600
dc.resize_window('Chrome', 800, 600)
# Minimize window
dc.minimize_window('Telegram')
# Maximize window
dc.maximize_window('VSCode')
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Start recording
dc.start_recording()
# Do some actions (manual for now, or wrap them)
dc.click(x=100, y=200)
dc.type_text('hello')
dc.press('enter')
# Stop and replay
actions = dc.stop_recording()
print(f"Recorded {len(actions)} actions")
# Replay later
dc.replay_actions(actions, delay_multiplier=1.0)
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Find element by color (RGB)
pos = dc.find_element_by_color((255, 0, 0), tolerance=20) # red
if pos:
    dc.click(x=pos[0], y=pos[1])
# Smart wait - poll until condition is true
dc.smart_wait(lambda: dc.active_window_contains('Done'), timeout=30)
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Drag from point A to B
dc.drag_drop(100, 200, 500, 600)
# Drag file to app
dc.drag_file_to_app('/path/to/file.txt', 400, 300)
PY
python - <<'PY'
from skills.desktop_control_linux import DesktopControllerLinux
dc = DesktopControllerLinux(require_approval=False)
# Click with automatic retry
dc.robust_click(100, 200)
# Type with automatic retry
dc.robust_type("Hello world")
PY
Same interface as DesktopController:
- move_mouse, click, drag, scroll, get_mouse_position
- type_text, press, hotkey, wait, launch_app, open_url, open_chrome, wait_retry_window, wait_retry_new_window, smart_retry
- click_image, click_image_or, login_form
- ensure_window, active_window_contains, wait_for_text, detect_state
- recover_reload, recover_back, retry_with_recovery
- run_steps
- register_preset, run_preset
- read_text_on_screen
- get_monitors, click_monitor
- robust_click, robust_type
- smart_wait, wait_for_window_stable
- drag_drop, drag_file_to_app
- resize_window, minimize_window, maximize_window
- open_firefox, open_edge
- detect_keyboard_layout
- find_element_by_color, find_button_vision
- start_recording, record_action, stop_recording, replay_actions

launch_app(app_name, wait_seconds=15, window_title=None, auto_detect_window=True)
- If window_title is provided: waits 15s, retries once, then errors if not found.
- auto_detect_window=True: detects a new window title automatically, waits 15s, retries once.

smart_retry(action_fn, check_fn, wait_seconds=15, retries=2)
- screenshot, screenshot_to, record_screen, get_pixel_color, find_on_screen
- get_all_windows, activate_window, focus_window_or_click, get_active_window
- copy_to_clipboard, get_from_clipboard

/tmp/.X11-unix when DISPLAY is missing