{"skill":{"slug":"agent-evaluation1","displayName":"agent-evaluation","summary":"Testing and benchmarking LLM agents including behavioral testing, capability assessment, reliability metrics, and production monitoring—where even top agents...","tags":{"latest":"1.0.0"},"stats":{"comments":0,"downloads":80,"installsAllTime":0,"installsCurrent":0,"stars":0,"versions":1},"createdAt":1776042808840,"updatedAt":1776043009022},"latestVersion":{"version":"1.0.0","createdAt":1776042808840,"changelog":"Agent Evaluation v1.0.0\n\n- Initial release with comprehensive tools for testing and benchmarking LLM agents.\n- Supports behavioral testing, capability assessment, reliability metrics, and production monitoring.\n- Includes clear distinction between benchmark performance and real-world reliability.\n- Provides statistical, behavioral contract, and adversarial testing patterns.\n- Lists common pitfalls (anti-patterns) and key \"sharp edges\" to watch for in agent evaluation.\n- Integrates all agent evaluation LLM calls via SkillBoss API Hub with simple Python example.","license":"MIT-0"},"metadata":null,"owner":{"handle":"abeltennyson","userId":"s174gfyzm9cmcv1mx6226h2s3584rfxt","displayName":"AbelTennyson","image":"https://avatars.githubusercontent.com/u/261241122?v=4"},"moderation":{"isSuspicious":true,"isMalwareBlocked":false,"verdict":"suspicious","reasonCodes":["suspicious.llm_suspicious"],"summary":"Detected: suspicious.llm_suspicious","engineVersion":"v2.2.0","updatedAt":1776043009022}}