Install
openclaw skills install mediapipe
On-device ML pipeline framework for vision, text, audio, and LLM inference. Cross-platform deployment to Android, iOS, web, desktop, edge devices, and IoT.
openclaw skills install mediapipe
MediaPipe is Google's open-source framework for building on-device machine learning pipelines. It provides cross-platform APIs for vision, text, audio, and LLM inference tasks, plus a low-level graph-based pipeline framework for custom ML workloads.
MediaPipe has two layers:
The Solutions layer consists of:
pip install mediapipe
Latest version as of 2026-05: 0.10.35. The Python package bundles all tasks. Models are downloaded separately at runtime or pre-downloaded.
Add to build.gradle:
implementation 'com.google.mediapipe:tasks-vision:0.10.35'
Replace vision with text, audio, or genai as needed.
npm install @mediapipe/tasks-vision
Available packages: @mediapipe/tasks-vision, @mediapipe/tasks-text, @mediapipe/tasks-audio, @mediapipe/tasks-genai.
pod 'MediaPipeTasksVision'
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
model_path = '/absolute/path/to/blaze_face_short_range.tflite'
base_options = python.BaseOptions(model_asset_path=model_path)
options = vision.FaceDetectorOptions(base_options=base_options)
detector = vision.FaceDetector.create_from_options(options)
image = mp.Image.create_from_file('photo.jpg')
result = detector.detect(image)
for detection in result.detections:
bbox = detection.bounding_box
print(f"Face at x={bbox.origin_x}, y={bbox.origin_y}, "
f"w={bbox.width}, h={bbox.height}, "
f"score={detection.categories[0].score}")
model_path = '/path/to/hand_landmarker.task'
options = vision.HandLandmarkerOptions(
base_options=python.BaseOptions(model_asset_path=model_path),
num_hands=2)
detector = vision.HandLandmarker.create_from_options(options)
image = mp.Image.create_from_file('hands.jpg')
result = detector.detect(image)
for hand_landmarks in result.hand_landmarks:
for lm in hand_landmarks:
print(f"Landmark: x={lm.x}, y={lm.y}, z={lm.z}")
model_path = '/path/to/pose_landmarker_lite.task'
options = vision.PoseLandmarkerOptions(
base_options=python.BaseOptions(model_asset_path=model_path))
detector = vision.PoseLandmarker.create_from_options(options)
image = mp.Image.create_from_file('person.jpg')
result = detector.detect(image)
# result.pose_landmarks is a list of NormalizedLandmark lists (33 landmarks each)
# result.pose_world_landmarks provides 3D world coordinates
model_path = '/path/to/efficientdet_lite0.tflite'
options = vision.ObjectDetectorOptions(
base_options=python.BaseOptions(model_asset_path=model_path),
max_results=5)
detector = vision.ObjectDetector.create_from_options(options)
image = mp.Image.create_from_file('scene.jpg')
result = detector.detect(image)
for detection in result.detections:
print(f"Class: {detection.categories[0].category_name}, "
f"BBox: {detection.bounding_box}")
from mediapipe.tasks.python import text
model_path = '/path/to/text_classifier.tflite'
options = text.TextClassifierOptions(
base_options=python.BaseOptions(model_asset_path=model_path))
classifier = text.TextClassifier.create_from_options(options)
result = classifier.classify("I absolutely loved this movie!")
for category in result.classifications[0].categories:
print(f"{category.category_name}: {category.score:.4f}")
import cv2
import mediapipe as mp
from mediapipe.tasks.python import vision
from mediapipe.framework.formats import landmark_pb2
# ... detect landmarks ...
# Convert result landmarks to NormalizedLandmarkList
hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
hand_landmarks_proto.landmark.extend([
landmark_pb2.NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z)
for lm in result.hand_landmarks[0]
])
# Draw on image
annotated = mp.solutions.drawing_utils.draw_landmarks(
image_rgb,
hand_landmarks_proto,
mp.solutions.hands.HAND_CONNECTIONS,
mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
mp.solutions.drawing_styles.get_default_hand_connections_style()
)
All vision tasks support three running modes: IMAGE, VIDEO, and LIVE_STREAM.
- blaze_face_short_range.tflite (2m), blaze_face_full_range.tflite (5m) — options: min_detection_confidence, min_suppression_threshold
- face_landmarker.task (478 3D landmarks), face_landmarker_v2_with_blendshapes.task — options: num_faces, min_face_detection_confidence, min_tracking_confidence, output_face_blendshapes, output_facial_transformation_matrixes
- hand_landmarker.task — options: num_hands, min_hand_detection_confidence, min_tracking_confidence
- pose_landmarker_lite.task, pose_landmarker_full.task, pose_landmarker_heavy.task — options: num_poses, min_pose_detection_confidence, min_tracking_confidence, output_segmentations
- holistic_landmarker.task — options: min_face_detection_confidence, min_pose_detection_confidence, min_hand_landmarks_confidence, output_face_blendshapes
- gesture_recognizer.task — options: min_hand_detection_confidence, min_tracking_confidence, canned_gestures_classifier_options
- efficientdet_lite0.tflite through efficientdet_lite2.tflite (COCO 80 classes) — options: max_results, score_threshold, category_allowlist, category_denylist
- efficientnet_lite0.tflite through efficientnet_lite4.tflite (ImageNet 1k) — options: max_results, score_threshold, category_allowlist/denylist
- mobilenet_v3_small.tflite, mobilenet_v3_large.tflite — options: l2_normalize, quantize
- options: output_category_mask, output_confidence_masks
- magic_touch.tflite, sam.tflite — options: output_category_mask, output_confidence_masks
- text_classifier.tflite (BERT-based), custom models via Model Maker — options: max_results, score_threshold, category_allowlist/denylist
- universal_sentence_encoder.tflite, bert_embedder.tflite — options: l2_normalize, quantize
- language_detector.tflite
- yamnet.tflite (521 audio event classes), custom models — options: max_results, score_threshold, category_allowlist/denylist
- AUDIO_CLIPS and AUDIO_STREAM running modes
MediaPipe includes on-device LLM inference via MediaPipe Tasks GenAI (as of v0.10.35):
- @mediapipe/tasks-genai package for web-based LLM inference
Customize pre-trained models with your own data without ML expertise:
pip install mediapipe-model-maker
from mediapipe_model_maker import text_classifier
data = text_classifier.Dataset.from_csv('reviews.csv')
model = text_classifier.create(data)
model.export_model()
Supports customization for text classification, object detection, image classification, and gesture recognition. Model Maker uses transfer learning with a few hundred examples.
For building custom on-device ML pipelines beyond pre-built solutions:
input_stream: "input_video"
output_stream: "output_video"
node {
calculator: "ImageToTensorCalculator"
input_stream: "IMAGE:input_video"
output_stream: "TENSORS:image_tensor"
}
node {
calculator: "InferenceCalculator"
input_stream: "TENSORS:image_tensor"
output_stream: "TENSORS:detection_tensors"
options {
[mediapipe.InferenceCalculatorOptions.ext] {
model_path: "/path/to/model.tflite"
}
}
}
The Framework is not available for Python or web — use MediaPipe Tasks/Solutions for those platforms.
Models are hosted at https://storage.googleapis.com/mediapipe-models/. Download programmatically:
# Python: download helper (if available) or manual curl
# wget https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/latest/hand_landmarker.task
For production, download models ahead of time and bundle with your app.
- Use absolute paths for model_asset_path. Relative paths or pathlib.Path objects may fail.
- On Android, place .task/.tflite files in src/main/assets/.
# IMAGE mode — single image inference
options = vision.FaceDetectorOptions(
base_options=python.BaseOptions(model_asset_path=model_path),
running_mode=vision.RunningMode.IMAGE)
# VIDEO mode — frame sequence with timestamps
options = vision.FaceDetectorOptions(
base_options=python.BaseOptions(model_asset_path=model_path),
running_mode=vision.RunningMode.VIDEO)
# result = detector.detect_for_video(image, timestamp_ms)
# LIVE_STREAM mode — async callback-based for camera streams
def on_result(result, image, timestamp):
pass # handle result asynchronously
options = vision.FaceDetectorOptions(
base_options=python.BaseOptions(model_asset_path=model_path),
running_mode=vision.RunningMode.LIVE_STREAM,
result_callback=on_result)
All task objects implement context manager protocol:
with vision.FaceDetector.create_from_options(options) as detector:
result = detector.detect(image)
# detector is automatically closed
import mediapipe as mp
# From file
image = mp.Image.create_from_file('photo.jpg')
# From numpy array (must be RGB, uint8)
import cv2
cv_image = cv2.imread('photo.jpg')
rgb_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_image)
base_options = python.BaseOptions(
model_asset_path=model_path,
delegate=python.BaseOptions.Delegate.GPU) # or .CPU (default)
try:
detector = vision.FaceDetector.create_from_options(options)
except Exception as e:
print(f"Failed to create detector: {e}")
# Common issues: wrong model path, incompatible model version, missing TFLite runtime
- .task bundles (new format) vs legacy .tflite + metadata
- mp.solutions.hands, mp.solutions.pose, mp.solutions.face_mesh APIs are legacy
- The Tasks API (e.g. mediapipe.tasks.python.vision.HandLandmarker) supersedes legacy APIs
- Avoid the mediapipe.solutions.* namespace; use Tasks for new projects