diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 46fb874..b1d65e1 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -58,6 +58,10 @@ Here are all valid language + template combinations: | typescript | openai-computer-use | ts-openai-cua | ts-openai-cua | Yes | OPENAI_API_KEY | | typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY | | typescript | claude-agent-sdk | ts-claude-agent-sdk | ts-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | +| typescript | yutori-computer-use | ts-yutori-cua | ts-yutori-cua | Yes | YUTORI_API_KEY | + +> **Note:** The `yutori-computer-use` template supports two modes: `computer_use` (default, full VM screenshots) and `playwright` (viewport-only screenshots via CDP). Both modes should be tested. + | python | sample-app | py-sample-app | python-basic | No | - | | python | captcha-solver | py-captcha-solver | python-captcha-solver | No | - | | python | browser-use | py-browser-use | python-bu | Yes | OPENAI_API_KEY | @@ -65,6 +69,11 @@ Here are all valid language + template combinations: | python | openai-computer-use | py-openai-cua | python-openai-cua | Yes | OPENAI_API_KEY | | python | openagi-computer-use | py-openagi-cua | python-openagi-cua | Yes | OAGI_API_KEY | | python | claude-agent-sdk | py-claude-agent-sdk | py-claude-agent-sdk | Yes | ANTHROPIC_API_KEY | +| python | yutori-computer-use | py-yutori-cua | python-yutori-cua | Yes | YUTORI_API_KEY | + +> **Yutori Modes:** +> - `computer_use` (default): Uses Kernel's Computer Controls API with full VM screenshots +> - `playwright`: Uses Playwright via CDP WebSocket for viewport-only screenshots (optimized for n1 model) ### Create Commands @@ -80,6 +89,7 @@ Run each of these (they are non-interactive when all flags are provided): ../bin/kernel create -n ts-openai-cua -l typescript -t openai-computer-use ../bin/kernel create -n ts-gemini-cua -l typescript -t gemini-computer-use ../bin/kernel create -n 
ts-claude-agent-sdk -l typescript -t claude-agent-sdk +../bin/kernel create -n ts-yutori-cua -l typescript -t yutori-computer-use # Python templates ../bin/kernel create -n py-sample-app -l python -t sample-app @@ -89,6 +99,7 @@ Run each of these (they are non-interactive when all flags are provided): ../bin/kernel create -n py-openai-cua -l python -t openai-computer-use ../bin/kernel create -n py-openagi-cua -l python -t openagi-computer-use ../bin/kernel create -n py-claude-agent-sdk -l python -t claude-agent-sdk +../bin/kernel create -n py-yutori-cua -l python -t yutori-computer-use ``` ## Step 5: Deploy Each Template @@ -176,6 +187,15 @@ echo "ANTHROPIC_API_KEY=" > .env cd .. ``` +**ts-yutori-cua** (needs YUTORI_API_KEY): + +```bash +cd ts-yutori-cua +echo "YUTORI_API_KEY=" > .env +../bin/kernel deploy index.ts --env-file .env +cd .. +``` + **py-browser-use** (needs OPENAI_API_KEY): ```bash @@ -221,6 +241,15 @@ echo "ANTHROPIC_API_KEY=" > .env cd .. ``` +**py-yutori-cua** (needs YUTORI_API_KEY): + +```bash +cd py-yutori-cua +echo "YUTORI_API_KEY=" > .env +../bin/kernel deploy main.py --env-file .env +cd .. +``` + ## Step 6: Provide Invoke Commands Once all deployments are complete, present the human with these invoke commands to test manually: @@ -235,6 +264,8 @@ kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipe kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? 
You are done successfully when the items are moved."}' kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "computer_use"}' +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "playwright"}' # Python apps kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -244,11 +275,13 @@ kernel invoke python-anthropic-cua cua-task --payload '{"query": "Go to http://m kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? button"}' kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. 
Do not click into the items.", "record_replay": true, "mode": "computer_use"}' +kernel invoke python-yutori-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and drag the 5 items in the To Do and In Progress columns to the Done section of the Kanban board. You are done successfully when the items are dragged to Done. Do not click into the items.", "record_replay": true, "mode": "playwright"}' ``` ## Step 7: Automated Runtime Testing (Optional) -**STOP and ask the human:** "Would you like me to automatically invoke all 15 templates and report back on their runtime status?" +**STOP and ask the human:** "Would you like me to automatically invoke all 19 test cases and report back on their runtime status?" If the human agrees, invoke each template use the Kernel CLI and collect results. Present findings in this format: @@ -268,6 +301,8 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | ts-openai-cua | ts-openai-cua | | | | ts-gemini-cua | ts-gemini-cua | | | | ts-claude-agent-sdk | ts-claude-agent-sdk | | | +| ts-yutori-cua | ts-yutori-cua | | mode: computer_use | +| ts-yutori-cua | ts-yutori-cua | | mode: playwright | | py-sample-app | python-basic | | | | py-captcha-solver | python-captcha-solver | | | | py-browser-use | python-bu | | | @@ -275,6 +310,8 @@ If the human agrees, invoke each template use the Kernel CLI and collect results | py-openai-cua | python-openai-cua | | | | py-openagi-cua | python-openagi-cua | | | | py-claude-agent-sdk | py-claude-agent-sdk | | | +| py-yutori-cua | python-yutori-cua | | mode: computer_use | +| py-yutori-cua | python-yutori-cua | | mode: playwright | Status values: - **SUCCESS**: App started and returned a result @@ -287,9 +324,9 @@ Notes should include brief error messages for failures or confirmation of succes - [ ] Built CLI with `make build` - [ ] Created QA directory - [ ] Got KERNEL_API_KEY from human -- [ ] Created all 15 template 
variations -- [ ] Got required API keys from human (OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, OAGI_API_KEY) -- [ ] Deployed all 15 apps +- [ ] Created all 17 template variations +- [ ] Got required API keys from human (OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, OAGI_API_KEY, YUTORI_API_KEY) +- [ ] Deployed all 17 apps - [ ] Provided invoke commands to human for manual testing - [ ] (Optional) Ran automated runtime testing and reviewed results diff --git a/.gitignore b/.gitignore index 900b34b..39ae834 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,6 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Finder (MacOS) folder config .DS_Store kernel + +# QA testing directories +qa-* diff --git a/pkg/create/templates.go b/pkg/create/templates.go index f99c4e6..957d45f 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -18,6 +18,7 @@ const ( TemplateStagehand = "stagehand" TemplateOpenAGIComputerUse = "openagi-computer-use" TemplateClaudeAgentSDK = "claude-agent-sdk" + TemplateYutoriComputerUse = "yutori-computer-use" ) type TemplateInfo struct { @@ -84,6 +85,11 @@ var Templates = map[string]TemplateInfo{ Description: "Implements a Claude Agent SDK browser automation agent", Languages: []string{LanguageTypeScript, LanguagePython}, }, + TemplateYutoriComputerUse: { + Name: "Yutori n1 Computer Use", + Description: "Implements a Yutori n1 computer use agent", + Languages: []string{LanguageTypeScript, LanguagePython}, + }, } // GetSupportedTemplatesForLanguage returns a list of all supported template names for a given language @@ -108,6 +114,8 @@ func GetSupportedTemplatesForLanguage(language string) TemplateKeyValues { return 1 case TemplateGeminiComputerUse: return 2 + case TemplateYutoriComputerUse: + return 3 default: return 10 } @@ -200,6 +208,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com 
and get the top 3 stories"}'`, }, + TemplateYutoriComputerUse: { + EntryPoint: "index.ts", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, + }, }, LanguagePython: { TemplateSampleApp: { @@ -237,6 +250,11 @@ var Commands = map[string]map[string]DeployConfig{ NeedsEnvFile: true, InvokeCommand: `kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}'`, }, + TemplateYutoriComputerUse: { + EntryPoint: "main.py", + NeedsEnvFile: true, + InvokeCommand: `kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'`, + }, }, } diff --git a/pkg/templates/python/yutori-computer-use/README.md b/pkg/templates/python/yutori-computer-use/README.md new file mode 100644 index 0000000..2f8ec2f --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/README.md @@ -0,0 +1,65 @@ +# Kernel Python Sample App - Yutori n1 Computer Use + +This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API. + +[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. + +## Setup + +1. Get your API keys: + - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com) + - **Yutori**: [yutori.com](https://yutori.com) + +2. Deploy the app: +```bash +kernel login +cp .env.example .env # Add your YUTORI_API_KEY +kernel deploy main.py --env-file .env +``` + +## Usage + +```bash +kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +``` + +## Recording Replays + +> **Note:** Replay recording is only available to Kernel users on paid plans. 
+ +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke python-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. + +## Viewport Configuration + +Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. Kernel's closest supported viewport is **1200×800 at 25Hz**, which this template uses by default. + +> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. The slight width difference (1200 vs 1280) should have minimal impact on accuracy. + +See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. + +## n1 Supported Actions + +| Action | Description | +|--------|-------------| +| `click` | Left mouse click at coordinates | +| `scroll` | Scroll page in a direction | +| `type` | Type text into focused element | +| `key_press` | Send keyboard input | +| `hover` | Move mouse without clicking | +| `drag` | Click-and-drag operation | +| `wait` | Pause for UI to update | +| `refresh` | Reload current page | +| `go_back` | Navigate back in history | +| `goto_url` | Navigate to a URL | +| `stop` | End task with final answer | + +## Resources + +- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/python/yutori-computer-use/_gitignore b/pkg/templates/python/yutori-computer-use/_gitignore new file mode 100644 index 0000000..22e9be5 --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/_gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.py[cod] +*$py.class +.env +*.log +.venv/ +venv/ diff --git a/pkg/templates/python/yutori-computer-use/loop.py 
# Browser back-ends the sampling loop can be asked to drive.
BrowserMode = Literal["computer_use", "playwright"]


class N1ComputerToolProtocol(Protocol):
    """Structural interface the sampling loop expects from a computer tool.

    The loop stores whichever concrete tool it selected for the requested mode
    in a variable of this type and only ever calls the two coroutines below.
    """

    async def screenshot(self) -> ToolResult:
        """Return a ``ToolResult`` for the current screen."""

    async def execute(self, action: N1Action) -> ToolResult:
        """Carry out ``action`` and return its ``ToolResult``."""
# n1 uses its own system prompt - custom prompts may degrade performance
# Per docs: "we generally do not recommend providing custom system prompts"


async def sampling_loop(
    *,
    model: str = "n1-preview-2025-11",
    task: str,
    api_key: str,
    kernel: Kernel,
    session_id: str,
    cdp_ws_url: Optional[str] = None,
    max_tokens: int = 4096,
    max_iterations: int = 50,
    viewport_width: int = 1200,
    viewport_height: int = 800,
    mode: BrowserMode = "computer_use",
) -> dict[str, Any]:
    """Run the n1 sampling loop until the model returns a stop action or max iterations.

    Args:
        model: Model name forwarded to the chat-completions call.
        task: Task text, sent as the first ``user`` message.
        api_key: API key used to build the OpenAI-compatible client.
        kernel: Kernel client handed to ``ComputerTool`` (computer_use mode).
        session_id: Browser session id handed to ``ComputerTool``.
        cdp_ws_url: CDP WebSocket URL; required when ``mode`` is ``"playwright"``.
        max_tokens: Forwarded to the chat-completions call.
        max_iterations: Upper bound on the number of model calls.
        viewport_width: Width used to scale n1's 1000x1000 coordinates.
        viewport_height: Height used to scale n1's 1000x1000 coordinates.
        mode: ``"playwright"`` selects ``PlaywrightComputerTool``; any other
            value selects ``ComputerTool``.

    Returns:
        ``{"messages": [...], "final_answer": ...}``. ``final_answer`` is the
        ``answer`` of the ``stop`` action, or ``None`` when the loop ended
        without one (no actions in a reply, or the iteration limit was hit).

    Raises:
        ValueError: If playwright mode is requested without ``cdp_ws_url``, or
            the API response carries no choices / no message.
        Exception: Any error raised by the API call is logged and re-raised.
    """
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.yutori.com/v1",
    )

    computer_tool: N1ComputerToolProtocol
    playwright_tool: Optional[PlaywrightComputerTool] = None

    print(f"Mode requested: {mode!r}, cdp_ws_url available: {cdp_ws_url is not None}")

    if mode == "playwright":
        if not cdp_ws_url:
            raise ValueError("cdp_ws_url is required for playwright mode")
        print(f"Connecting to CDP WebSocket: {cdp_ws_url[:50]}...")
        playwright_tool = PlaywrightComputerTool(cdp_ws_url, viewport_width, viewport_height)
        await playwright_tool.connect()
        computer_tool = playwright_tool
        print("Using playwright mode (viewport-only screenshots)")
    else:
        computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height)
        print("Using computer_use mode (Computer Controls API)")

    try:
        initial_screenshot = await computer_tool.screenshot()

        conversation_messages: list[dict[str, Any]] = [
            {
                "role": "user",
                "content": [{"type": "text", "text": task}],
            }
        ]

        if initial_screenshot.get("base64_image"):
            conversation_messages.append({
                "role": "observation",
                "content": [_image_part(initial_screenshot["base64_image"])],
            })

        final_answer: Optional[str] = None

        # for/else: the else branch runs only when the iteration budget is
        # exhausted, not when the loop ends early via ``break``.
        for iteration in range(1, max_iterations + 1):
            print(f"\n=== Iteration {iteration} ===")

            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=conversation_messages,
                    max_tokens=max_tokens,
                    temperature=0.3,
                )
            except Exception as api_error:
                print(f"API call failed: {api_error}")
                raise

            if not response.choices:
                print(f"No choices in response: {response}")
                raise ValueError("No choices in API response")

            assistant_message = response.choices[0].message
            if not assistant_message:
                raise ValueError("No response from model")

            response_content = assistant_message.content or ""
            print("Assistant response:", response_content)

            conversation_messages.append({
                "role": "assistant",
                "content": response_content,
            })

            parsed = _parse_n1_response(response_content)
            actions = parsed.get("actions") if parsed else None

            if not actions or not isinstance(actions, list):
                print("No actions found in response, ending loop")
                break

            for action in actions:
                if not isinstance(action, dict):
                    # Tell the model about the malformed entry instead of crashing.
                    print(f"Skipping malformed action: {action!r}")
                    conversation_messages.append(
                        _build_observation({"error": f"Malformed action: {action!r}"})
                    )
                    continue

                print(f"Executing action: {action.get('action_type')}", action)

                if action.get("action_type") == "stop":
                    final_answer = action.get("answer")
                    print(f"Stop action received, final answer: {final_answer}")
                    return {"messages": conversation_messages, "final_answer": final_answer}

                result: ToolResult
                try:
                    # Scaling sits inside the try so malformed coordinates are
                    # reported to the model like any other failed action.
                    scaled_action = _scale_coordinates(action, viewport_width, viewport_height)
                    result = await computer_tool.execute(scaled_action)
                except Exception as e:
                    print(f"Action failed: {e}")
                    result = {"error": str(e)}

                observation = _build_observation(result)
                if observation is not None:
                    conversation_messages.append(observation)
        else:
            print("Max iterations reached")

        return {
            "messages": conversation_messages,
            "final_answer": final_answer,
        }
    finally:
        if playwright_tool:
            await playwright_tool.disconnect()


def _image_part(base64_image: str) -> dict[str, Any]:
    """Wrap a base64 screenshot in an ``image_url`` content part."""
    # NOTE(review): the data URL is labelled image/png while the module
    # docstring recommends WebP - confirm what the tools' screenshot() encodes.
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
    }


def _build_observation(result: ToolResult) -> Optional[dict[str, Any]]:
    """Turn a tool result into an ``observation`` message, or ``None`` if it is empty.

    Output text and screenshot take precedence; the error text is only used
    when the result carries neither.
    """
    parts: list[dict[str, Any]] = []

    if result.get("output"):
        parts.append({"type": "text", "text": result["output"]})
    if result.get("base64_image"):
        parts.append(_image_part(result["base64_image"]))
    if not parts and result.get("error"):
        parts.append({"type": "text", "text": f"Action failed: {result['error']}"})

    if not parts:
        return None
    return {"role": "observation", "content": parts}


def _parse_n1_response(content: str) -> Optional[dict[str, Any]]:
    """Parse the model reply into a JSON object.

    The reply is tried as JSON first; if that fails, the span from the first
    ``{`` to the last ``}`` is tried. Returns ``None`` when nothing parses or
    when the parsed value is not a JSON object, so callers can rely on
    ``dict`` methods.
    """
    try:
        # The response should be JSON
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # Try to extract JSON from the response if it's wrapped in text
        json_match = re.search(r'\{[\s\S]*\}', content)
        if not json_match:
            return None
        try:
            parsed = json.loads(json_match.group(0))
        except json.JSONDecodeError:
            print(f"Failed to parse action JSON: {json_match.group(0)}")
            return None

    return parsed if isinstance(parsed, dict) else None


def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action:
    """Return a copy of ``action`` with n1's 1000x1000 coordinates scaled to the viewport.

    Only ``center_coordinates`` and ``start_coordinates`` are touched; the
    input mapping is left unmodified.
    """
    scaled = dict(action)

    for key in ("center_coordinates", "start_coordinates"):
        coords = scaled.get(key)
        if coords:
            scaled[key] = [
                round((coords[0] / 1000) * viewport_width),
                round((coords[1] / 1000) * viewport_height),
            ]

    return scaled
class QueryInput(TypedDict):
    query: str
    record_replay: Optional[bool]
    # Browser interaction mode:
    # - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - default
    # - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1)
    mode: Optional[BrowserMode]


class QueryOutput(TypedDict):
    result: str
    replay_url: Optional[str]


api_key = os.getenv("YUTORI_API_KEY")
if not api_key:
    raise ValueError("YUTORI_API_KEY is not set")

app = kernel.App("python-yutori-cua")


@app.action("cua-task")
async def cua_task(
    ctx: kernel.KernelContext,
    payload: QueryInput,
) -> QueryOutput:
    """
    Process a user query using Yutori n1 Computer Use with Kernel's browser automation.

    Args:
        ctx: Kernel context containing invocation information
        payload: An object containing:
            - query: The task/query string to process
            - record_replay: Optional boolean to enable video replay recording
            - mode: Optional browser mode, "computer_use" (default) or "playwright"

    Returns:
        A dictionary containing:
            - result: The result of the sampling loop as a string
            - replay_url: URL to view the replay (if recording was enabled)

    Raises:
        ValueError: If the query is missing or the mode is not a supported value.
    """
    if not payload or not payload.get("query"):
        raise ValueError("Query is required")

    # A JSON null for record_replay is treated the same as an absent key.
    record_replay = bool(payload.get("record_replay"))

    mode: BrowserMode = payload.get("mode") or "computer_use"
    # Reject unknown modes: the sampling loop treats anything other than
    # "playwright" as computer_use, so a typo would otherwise go unnoticed.
    if mode not in ("computer_use", "playwright"):
        raise ValueError(
            f"Unsupported mode: {mode!r} (expected 'computer_use' or 'playwright')"
        )

    async with KernelBrowserSession(
        stealth=True,
        record_replay=record_replay,
    ) as session:
        print("Kernel browser live view url:", session.live_view_url)

        loop_result = await sampling_loop(
            model="n1-preview-2025-11",
            task=payload["query"],
            api_key=str(api_key),
            kernel=session.kernel,
            session_id=str(session.session_id),
            cdp_ws_url=session.cdp_ws_url,
            viewport_width=session.viewport_width,
            viewport_height=session.viewport_height,
            mode=mode,
        )

    final_answer = loop_result.get("final_answer")
    messages = loop_result.get("messages", [])

    if final_answer:
        result = final_answer
    else:
        # Extract last assistant message
        result = _extract_last_assistant_message(messages)

    # replay_view_url is read after the session block has exited.
    return {
        "result": result,
        "replay_url": session.replay_view_url,
    }
def _extract_last_assistant_message(messages: list) -> str:
    """Return a human-readable result from the most recent assistant message.

    Messages are scanned newest-first. For each assistant message with string
    content:
      - content that is not JSON is returned as-is;
      - a JSON object with a truthy ``thoughts`` field yields that field;
      - a JSON object without ``thoughts`` is skipped and the scan continues
        with older messages;
      - any other JSON value (``null``, a list, a bare string or number) is
        returned as the raw content, since there is nothing to extract.

    Returns ``"Task completed"`` when no message qualifies.
    """
    import json

    for msg in reversed(messages):
        if msg.get("role") != "assistant":
            continue

        content = msg.get("content")
        if not isinstance(content, str):
            continue

        # Try to parse the thoughts from JSON response
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            return content

        if not isinstance(parsed, dict):
            # Non-object JSON has no .get(); treat it like plain text.
            return content

        if parsed.get("thoughts"):
            return parsed["thoughts"]

    return "Task completed"
@dataclass
class KernelBrowserSession:
    """
    Manages Kernel browser lifecycle as an async context manager.

    Creates a browser session on entry and cleans it up on exit.
    Optionally records a video replay of the entire session.
    Provides session_id to computer tools.

    Replay recording is best-effort in both directions: a failure to start or
    to finalize the replay is logged as a warning and never prevents the
    browser from being created or destroyed.

    Usage:
        async with KernelBrowserSession(record_replay=True) as session:
            # Use session.session_id and session.kernel for operations
            pass
        # Browser is automatically cleaned up, replay URL available in session.replay_view_url
    """

    stealth: bool = True
    timeout_seconds: int = 300

    viewport_width: int = 1200
    viewport_height: int = 800

    # Replay recording options
    record_replay: bool = False
    replay_grace_period: float = 5.0  # Seconds to wait before stopping replay

    # Set after browser creation
    session_id: Optional[str] = field(default=None, init=False)
    live_view_url: Optional[str] = field(default=None, init=False)
    cdp_ws_url: Optional[str] = field(default=None, init=False)
    replay_id: Optional[str] = field(default=None, init=False)
    replay_view_url: Optional[str] = field(default=None, init=False)
    _kernel: Optional[Kernel] = field(default=None, init=False)

    async def __aenter__(self) -> "KernelBrowserSession":
        """Create the browser, record its identifiers, and optionally start a replay."""
        self._kernel = Kernel()

        browser = self._kernel.browsers.create(
            stealth=self.stealth,
            timeout_seconds=self.timeout_seconds,
            viewport={
                "width": self.viewport_width,
                "height": self.viewport_height,
                "refresh_rate": 25,
            },
        )

        self.session_id = browser.session_id
        self.live_view_url = browser.browser_live_view_url
        self.cdp_ws_url = browser.cdp_ws_url

        print(f"Kernel browser created: {self.session_id}")
        print(f"Live view URL: {self.live_view_url}")

        # Start replay recording if enabled
        if self.record_replay:
            try:
                await self._start_replay()
            except Exception as e:
                print(f"Warning: Failed to start replay recording: {e}")
                print("Continuing without replay recording.")

        return self

    async def _start_replay(self) -> None:
        """Start a replay recording and remember its id; no-op without a live session."""
        if not self._kernel or not self.session_id:
            return

        print("Starting replay recording...")
        replay = self._kernel.browsers.replays.start(self.session_id)
        self.replay_id = replay.replay_id
        print(f"Replay recording started: {self.replay_id}")

    async def _stop_and_get_replay_url(self) -> None:
        """Stop the replay and poll (up to 60s) until its view URL is listed.

        Sets ``replay_view_url`` when the replay shows up in the listing.
        Errors from the stop call propagate to the caller; errors while
        polling are retried until the deadline and the last one is reported.
        """
        if not self._kernel or not self.session_id or not self.replay_id:
            return

        print("Stopping replay recording...")
        self._kernel.browsers.replays.stop(
            replay_id=self.replay_id,
            id=self.session_id,
        )
        print("Replay recording stopped. Processing video...")

        # Wait a moment for processing
        await asyncio.sleep(2)

        # Poll for replay to be ready (with timeout). A monotonic clock keeps
        # the deadline immune to wall-clock adjustments.
        max_wait = 60  # seconds
        deadline = time.monotonic() + max_wait
        replay_ready = False
        last_error: Optional[Exception] = None

        while time.monotonic() < deadline:
            try:
                replays = self._kernel.browsers.replays.list(self.session_id)
                for replay in replays:
                    if replay.replay_id == self.replay_id:
                        self.replay_view_url = replay.replay_view_url
                        replay_ready = True
                        break
                if replay_ready:
                    break
            except Exception as e:
                # Keep retrying, but remember the failure so it can be reported.
                last_error = e
            await asyncio.sleep(1)

        if not replay_ready:
            print("Warning: Replay may still be processing")
            if last_error is not None:
                print(f"Last error while polling for replay: {last_error}")
        elif self.replay_view_url:
            print(f"Replay view URL: {self.replay_view_url}")

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Finalize the replay (best-effort), destroy the browser, drop the client."""
        try:
            if self._kernel and self.session_id:
                try:
                    # Stop replay if recording was enabled
                    if self.record_replay and self.replay_id:
                        # Wait grace period before stopping to capture final state
                        if self.replay_grace_period > 0:
                            print(f"Waiting {self.replay_grace_period}s grace period...")
                            await asyncio.sleep(self.replay_grace_period)
                        try:
                            await self._stop_and_get_replay_url()
                        except Exception as e:
                            # A replay failure must not fail the invocation or
                            # mask an exception raised inside the with-block.
                            print(f"Warning: Failed to finalize replay recording: {e}")
                finally:
                    print(f"Destroying browser session: {self.session_id}")
                    self._kernel.browsers.delete_by_id(self.session_id)
                    print("Browser session destroyed.")
        finally:
            # Reset even if deletion raised, so `kernel` reports the session as closed.
            self._kernel = None

    @property
    def kernel(self) -> Kernel:
        """The Kernel client; raises ``RuntimeError`` outside the ``async with`` block."""
        if self._kernel is None:
            raise RuntimeError("Session not initialized. Use async with context.")
        return self._kernel
class ToolError(Exception):
    """Raised when executing a tool action fails."""

    def __init__(self, message: str):
        # Initialise the base exception first, then expose the text as an attribute.
        super().__init__(message)
        self.message = message


class ToolResult(TypedDict, total=False):
    # Every key is optional (total=False).
    output: str
    error: str
    base64_image: str
# Imported here because the `X | Y` union operator between generic aliases is
# only available from Python 3.10, while this template declares
# requires-python >= 3.9.
from typing import Union

TYPING_DELAY_MS = 12  # Typing delay in milliseconds (used by Kernel API)
# Delays in seconds for asyncio.sleep (matches TypeScript 300ms = 0.3s)
SCREENSHOT_DELAY_S = 0.3
ACTION_DELAY_S = 0.3


# n1 action types
N1ActionType = Literal[
    "click",
    "scroll",
    "type",
    "key_press",
    "hover",
    "drag",
    "wait",
    "refresh",
    "go_back",
    "goto_url",
    "read_texts_and_links",
    "stop",
]


class N1Action(TypedDict, total=False):
    # One action emitted by n1. Every key is optional (total=False); the
    # handlers read only the keys relevant to their action_type.
    action_type: N1ActionType
    # Class-body annotations are evaluated at definition time, so Union[...]
    # is used instead of `tuple[int, int] | list[int]` (TypeError on 3.9).
    center_coordinates: Union[tuple[int, int], list[int]]
    start_coordinates: Union[tuple[int, int], list[int]]
    direction: Literal["up", "down", "left", "right"]
    amount: int
    text: str
    press_enter_after: bool
    clear_before_typing: bool
    key_comb: str
    url: str
    answer: str


# Key mappings from Playwright format (n1 output) to xdotool format (Kernel)
KEY_MAP = {
    "Enter": "Return",
    "Escape": "Escape",
    "Backspace": "BackSpace",
    "Tab": "Tab",
    "Delete": "Delete",
    "ArrowUp": "Up",
    "ArrowDown": "Down",
    "ArrowLeft": "Left",
    "ArrowRight": "Right",
    "Home": "Home",
    "End": "End",
    "PageUp": "Page_Up",
    "PageDown": "Page_Down",
    # F1..F12 map to themselves.
    **{f"F{n}": f"F{n}" for n in range(1, 13)},
}

# Modifier aliases, keyed by lower-case name.
MODIFIER_MAP = {
    "control": "ctrl",
    "ctrl": "ctrl",
    "alt": "alt",
    "shift": "shift",
    "meta": "super",
    "command": "super",
    "cmd": "super",
}
+ "scroll": self._handle_scroll, + "type": self._handle_type, + "key_press": self._handle_key_press, + "hover": self._handle_hover, + "drag": self._handle_drag, + "wait": self._handle_wait, + "refresh": self._handle_refresh, + "go_back": self._handle_go_back, + "goto_url": self._handle_goto_url, + "read_texts_and_links": self._handle_read_texts_and_links, + "stop": self._handle_stop, + } + + handler = handlers.get(action_type) + if not handler: + raise ToolError(f"Unknown action type: {action_type}") + + return await handler(action) + + async def _handle_click(self, action: N1Action) -> ToolResult: + coords = self._get_coordinates(action.get("center_coordinates")) + + self.kernel.browsers.computer.click_mouse( + self.session_id, + x=coords["x"], + y=coords["y"], + button="left", + click_type="click", + num_clicks=1, + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_scroll(self, action: N1Action) -> ToolResult: + coords = self._get_coordinates(action.get("center_coordinates")) + direction = action.get("direction") + amount = action.get("amount", 3) + + if direction not in ("up", "down", "left", "right"): + raise ToolError(f"Invalid scroll direction: {direction}") + + scroll_delta = amount * 100 + + delta_x = 0 + delta_y = 0 + + if direction == "up": + delta_y = -scroll_delta + elif direction == "down": + delta_y = scroll_delta + elif direction == "left": + delta_x = -scroll_delta + elif direction == "right": + delta_x = scroll_delta + + self.kernel.browsers.computer.scroll( + self.session_id, + x=coords["x"], + y=coords["y"], + delta_x=delta_x, + delta_y=delta_y, + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_type(self, action: N1Action) -> ToolResult: + text = action.get("text") + if not text: + raise ToolError("text is required for type action") + + if action.get("clear_before_typing"): + self.kernel.browsers.computer.press_key( + self.session_id, + 
keys=["ctrl+a"], + ) + await asyncio.sleep(0.1) + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["BackSpace"], + ) + await asyncio.sleep(0.1) + + self.kernel.browsers.computer.type_text( + self.session_id, + text=text, + delay=TYPING_DELAY_MS, + ) + + if action.get("press_enter_after"): + await asyncio.sleep(0.1) + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["Return"], + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_key_press(self, action: N1Action) -> ToolResult: + key_comb = action.get("key_comb") + if not key_comb: + raise ToolError("key_comb is required for key_press action") + + mapped_key = self._map_key(key_comb) + + self.kernel.browsers.computer.press_key( + self.session_id, + keys=[mapped_key], + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_hover(self, action: N1Action) -> ToolResult: + coords = self._get_coordinates(action.get("center_coordinates")) + + self.kernel.browsers.computer.move_mouse( + self.session_id, + x=coords["x"], + y=coords["y"], + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_drag(self, action: N1Action) -> ToolResult: + start_coords = self._get_coordinates(action.get("start_coordinates")) + end_coords = self._get_coordinates(action.get("center_coordinates")) + + self.kernel.browsers.computer.drag_mouse( + self.session_id, + path=[[start_coords["x"], start_coords["y"]], [end_coords["x"], end_coords["y"]]], + button="left", + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_wait(self, action: N1Action) -> ToolResult: + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_refresh(self, action: N1Action) -> ToolResult: + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["F5"], + ) + await asyncio.sleep(2) + return await self.screenshot() + 
+ async def _handle_go_back(self, action: N1Action) -> ToolResult: + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["alt+Left"], + ) + await asyncio.sleep(1.5) + return await self.screenshot() + + async def _handle_goto_url(self, action: N1Action) -> ToolResult: + url = action.get("url") + if not url: + raise ToolError("url is required for goto_url action") + + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["ctrl+l"], + ) + await asyncio.sleep(ACTION_DELAY_S) + + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["ctrl+a"], + ) + await asyncio.sleep(0.1) + + self.kernel.browsers.computer.type_text( + self.session_id, + text=url, + delay=TYPING_DELAY_MS, + ) + await asyncio.sleep(ACTION_DELAY_S) + + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["Return"], + ) + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: + try: + result = self.kernel.browsers.playwright.execute( + self.session_id, + code=""" + const snapshot = await page._snapshotForAI(); + const url = page.url(); + const title = await page.title(); + return { url, title, snapshot }; + """, + timeout_sec=30 + ) + + screenshot_result = await self.screenshot() + + if result.success and result.result: + data = result.result + return { + "base64_image": screenshot_result.get("base64_image", ""), + "output": json.dumps({ + "url": data.get("url"), + "title": data.get("title"), + "snapshot": data.get("snapshot") + }, indent=2) + } + + print("Playwright execution failed, falling back to screenshot only") + return screenshot_result + except Exception as e: + print(f"read_texts_and_links failed: {e}") + return await self.screenshot() + + async def _handle_stop(self, action: N1Action) -> ToolResult: + return {"output": action.get("answer", "Task completed")} + + async def screenshot(self) -> ToolResult: + try: + response = 
self.kernel.browsers.computer.capture_screenshot( + self.session_id + ) + image_bytes = response.read() + base64_image = base64.b64encode(image_bytes).decode("utf-8") + return {"base64_image": base64_image} + except Exception as e: + raise ToolError(f"Failed to take screenshot: {e}") + + def _get_coordinates( + self, coords: tuple[int, int] | list[int] | None + ) -> dict[str, int]: + if coords is None or len(coords) != 2: + # Default to center of screen + return {"x": self.width // 2, "y": self.height // 2} + + x, y = coords + if not isinstance(x, (int, float)) or not isinstance(y, (int, float)) or x < 0 or y < 0: + raise ToolError(f"Invalid coordinates: {coords}") + + return {"x": int(x), "y": int(y)} + + def _map_key(self, key: str) -> str: + # Handle modifier combinations (e.g., "Control+a" -> "ctrl+a") + if "+" in key: + parts = key.split("+") + mapped_parts = [] + for part in parts: + trimmed = part.strip() + lower = trimmed.lower() + + # Map modifier names + if lower in MODIFIER_MAP: + mapped_parts.append(MODIFIER_MAP[lower]) + else: + # Check KEY_MAP for special keys + mapped_parts.append(KEY_MAP.get(trimmed, trimmed)) + + return "+".join(mapped_parts) + + return KEY_MAP.get(key, key) diff --git a/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py new file mode 100644 index 0000000..df98628 --- /dev/null +++ b/pkg/templates/python/yutori-computer-use/tools/playwright_computer.py @@ -0,0 +1,307 @@ +""" +Yutori n1 Playwright Computer Tool + +Maps n1 action format to Playwright methods via CDP WebSocket connection. +Uses viewport-only screenshots optimized for Yutori n1's training data. 
+ +See: https://docs.yutori.com/reference/n1#screenshot-requirements +""" + +import asyncio +import base64 +import json +from typing import Optional + +from playwright.async_api import async_playwright, Browser, BrowserContext, Page + +from .base import ToolError, ToolResult +from .computer import N1Action + +# Delay after actions before taking screenshot (in seconds for asyncio.sleep) +# Matches TypeScript SCREENSHOT_DELAY_MS = 300 (300ms = 0.3s) +SCREENSHOT_DELAY_S = 0.3 + +# Key mappings from n1 output format to Playwright format +KEY_MAP = { + "Return": "Enter", + "BackSpace": "Backspace", + "Page_Up": "PageUp", + "Page_Down": "PageDown", +} + +MODIFIER_MAP = { + "ctrl": "Control", + "super": "Meta", + "command": "Meta", + "cmd": "Meta", +} + + +class PlaywrightComputerTool: + def __init__(self, cdp_ws_url: str, width: int = 1200, height: int = 800): + self.cdp_ws_url = cdp_ws_url + self.width = width + self.height = height + self._playwright = None + self._browser: Optional[Browser] = None + self._context: Optional[BrowserContext] = None + self._page: Optional[Page] = None + + async def connect(self) -> None: + if self._browser: + return # Already connected + + self._playwright = await async_playwright().start() + self._browser = await self._playwright.chromium.connect_over_cdp(self.cdp_ws_url) + + # Get existing context or create new one + contexts = self._browser.contexts + self._context = contexts[0] if contexts else await self._browser.new_context() + + # Handle new page events + self._context.on("page", self._handle_new_page) + + # Get existing page or create new one + pages = self._context.pages + self._page = pages[0] if pages else await self._context.new_page() + + # Set viewport size to Yutori's recommended dimensions + await self._page.set_viewport_size({"width": self.width, "height": self.height}) + self._page.on("close", self._handle_page_close) + + async def disconnect(self) -> None: + if self._playwright: + await self._playwright.stop() + 
self._playwright = None + self._browser = None + self._context = None + self._page = None + + def _handle_new_page(self, page: Page) -> None: + print("New page created") + self._page = page + page.on("close", self._handle_page_close) + + def _handle_page_close(self, closed_page: Page) -> None: + print("Page closed") + if self._page == closed_page and self._context: + pages = self._context.pages + if pages: + self._page = pages[-1] + else: + print("Warning: All pages have been closed.") + self._page = None + + def _assert_page(self) -> Page: + if not self._page: + raise ToolError("Page not available. Did you call connect()?") + return self._page + + async def execute(self, action: N1Action) -> ToolResult: + action_type = action.get("action_type") + + handlers = { + "click": self._handle_click, + "scroll": self._handle_scroll, + "type": self._handle_type, + "key_press": self._handle_key_press, + "hover": self._handle_hover, + "drag": self._handle_drag, + "wait": self._handle_wait, + "refresh": self._handle_refresh, + "go_back": self._handle_go_back, + "goto_url": self._handle_goto_url, + "read_texts_and_links": self._handle_read_texts_and_links, + "stop": self._handle_stop, + } + + handler = handlers.get(action_type) + if not handler: + raise ToolError(f"Unknown action type: {action_type}") + + return await handler(action) + + async def _handle_click(self, action: N1Action) -> ToolResult: + page = self._assert_page() + coords = self._get_coordinates(action.get("center_coordinates")) + + await page.mouse.click(coords["x"], coords["y"]) + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_scroll(self, action: N1Action) -> ToolResult: + page = self._assert_page() + coords = self._get_coordinates(action.get("center_coordinates")) + direction = action.get("direction") + amount = action.get("amount", 3) + + if direction not in ("up", "down", "left", "right"): + raise ToolError(f"Invalid scroll direction: {direction}") + + 
scroll_delta = amount * 100 + + await page.mouse.move(coords["x"], coords["y"]) + + delta_x = 0 + delta_y = 0 + + if direction == "up": + delta_y = -scroll_delta + elif direction == "down": + delta_y = scroll_delta + elif direction == "left": + delta_x = -scroll_delta + elif direction == "right": + delta_x = scroll_delta + + await page.mouse.wheel(delta_x, delta_y) + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_type(self, action: N1Action) -> ToolResult: + page = self._assert_page() + text = action.get("text") + if not text: + raise ToolError("text is required for type action") + + if action.get("clear_before_typing"): + await page.keyboard.press("Control+a") + await asyncio.sleep(0.1) + await page.keyboard.press("Backspace") + await asyncio.sleep(0.1) + + await page.keyboard.type(text) + + if action.get("press_enter_after"): + await asyncio.sleep(0.1) + await page.keyboard.press("Enter") + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_key_press(self, action: N1Action) -> ToolResult: + page = self._assert_page() + key_comb = action.get("key_comb") + if not key_comb: + raise ToolError("key_comb is required for key_press action") + + mapped_key = self._map_key_to_playwright(key_comb) + await page.keyboard.press(mapped_key) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_hover(self, action: N1Action) -> ToolResult: + page = self._assert_page() + coords = self._get_coordinates(action.get("center_coordinates")) + + await page.mouse.move(coords["x"], coords["y"]) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_drag(self, action: N1Action) -> ToolResult: + page = self._assert_page() + start_coords = self._get_coordinates(action.get("start_coordinates")) + end_coords = self._get_coordinates(action.get("center_coordinates")) + + await page.mouse.move(start_coords["x"], 
start_coords["y"]) + await page.mouse.down() + await asyncio.sleep(0.05) + await page.mouse.move(end_coords["x"], end_coords["y"], steps=12) + await page.mouse.up() + + await asyncio.sleep(0.3) + return await self.screenshot() + + async def _handle_wait(self, action: N1Action) -> ToolResult: + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_refresh(self, action: N1Action) -> ToolResult: + page = self._assert_page() + await page.reload() + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_go_back(self, action: N1Action) -> ToolResult: + page = self._assert_page() + await page.go_back() + await asyncio.sleep(1.5) + return await self.screenshot() + + async def _handle_goto_url(self, action: N1Action) -> ToolResult: + page = self._assert_page() + url = action.get("url") + if not url: + raise ToolError("url is required for goto_url action") + + await page.goto(url) + await asyncio.sleep(2) + return await self.screenshot() + + async def _handle_read_texts_and_links(self, action: N1Action) -> ToolResult: + page = self._assert_page() + try: + snapshot = await page.locator("body").aria_snapshot() + url = page.url + title = await page.title() + + screenshot_result = await self.screenshot() + + return { + "base64_image": screenshot_result.get("base64_image", ""), + "output": json.dumps({"url": url, "title": title, "snapshot": snapshot}, indent=2), + } + except Exception as e: + print(f"read_texts_and_links failed: {e}") + return await self.screenshot() + + async def _handle_stop(self, action: N1Action) -> ToolResult: + return {"output": action.get("answer", "Task completed")} + + async def screenshot(self) -> ToolResult: + page = self._assert_page() + try: + buffer = await page.screenshot(full_page=False) + base64_image = base64.b64encode(buffer).decode("utf-8") + return {"base64_image": base64_image} + except Exception as e: + raise ToolError(f"Failed to take screenshot: {e}") + + def get_current_url(self) -> str: + 
page = self._assert_page() + return page.url + + def _get_coordinates( + self, coords: tuple[int, int] | list[int] | None + ) -> dict[str, int]: + if coords is None or len(coords) != 2: + # Default to center of viewport + return {"x": self.width // 2, "y": self.height // 2} + + x, y = coords + if not isinstance(x, (int, float)) or not isinstance(y, (int, float)) or x < 0 or y < 0: + raise ToolError(f"Invalid coordinates: {coords}") + + return {"x": int(x), "y": int(y)} + + def _map_key_to_playwright(self, key: str) -> str: + # Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") + if "+" in key: + parts = key.split("+") + mapped_parts = [] + for part in parts: + trimmed = part.strip() + lower = trimmed.lower() + + # Map modifier names + if lower in MODIFIER_MAP: + mapped_parts.append(MODIFIER_MAP[lower]) + else: + # Check KEY_MAP for special keys + mapped_parts.append(KEY_MAP.get(trimmed, trimmed)) + + return "+".join(mapped_parts) + + return KEY_MAP.get(key, key) diff --git a/pkg/templates/typescript/yutori-computer-use/README.md b/pkg/templates/typescript/yutori-computer-use/README.md new file mode 100644 index 0000000..625c94d --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/README.md @@ -0,0 +1,65 @@ +# Kernel TypeScript Sample App - Yutori n1 Computer Use + +This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API (the default `computer_use` mode) or, when the payload sets `"mode": "playwright"`, with Playwright over CDP. + +[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. + +## Setup + +1. Get your API keys: + - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com) + - **Yutori**: [yutori.com](https://yutori.com) + +2. 
Deploy the app: +```bash +kernel login +cp .env.example .env # Add your YUTORI_API_KEY +kernel deploy index.ts --env-file .env +``` + +## Usage + +```bash +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +``` + +## Recording Replays + +> **Note:** Replay recording is only available to Kernel users on paid plans. + +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke ts-yutori-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. + +## Viewport Configuration + +Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. Kernel's closest supported viewport is **1200×800 at 25Hz**, which this template uses by default. + +> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. The slight width difference (1200 vs 1280) should have minimal impact on accuracy. + +See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. 
+ +## n1 Supported Actions + +| Action | Description | +|--------|-------------| +| `click` | Left mouse click at coordinates | +| `scroll` | Scroll page in a direction | +| `type` | Type text into focused element | +| `key_press` | Send keyboard input | +| `hover` | Move mouse without clicking | +| `drag` | Click-and-drag operation | +| `wait` | Pause for UI to update | +| `refresh` | Reload current page | +| `go_back` | Navigate back in history | +| `goto_url` | Navigate to a URL | +| `stop` | End task with final answer | + +## Resources + +- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/typescript/yutori-computer-use/_gitignore b/pkg/templates/typescript/yutori-computer-use/_gitignore new file mode 100644 index 0000000..aa0926a --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/_gitignore @@ -0,0 +1,4 @@ +node_modules/ +dist/ +.env +*.log diff --git a/pkg/templates/typescript/yutori-computer-use/index.ts b/pkg/templates/typescript/yutori-computer-use/index.ts new file mode 100644 index 0000000..afe51ba --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/index.ts @@ -0,0 +1,100 @@ +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { samplingLoop, type BrowserMode } from './loop'; +import { KernelBrowserSession } from './session'; + +const kernel = new Kernel(); + +const app = kernel.app('ts-yutori-cua'); + +interface QueryInput { + query: string; + record_replay?: boolean; + /** + * Browser interaction mode: + * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) - default + * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + */ + mode?: BrowserMode; +} + +interface QueryOutput { + result: string; + replay_url?: string; +} + +// LLM API Keys are set in the environment during `kernel deploy -e YUTORI_API_KEY=XXX` +// See 
https://www.kernel.sh/docs/launch/deploy#environment-variables +const YUTORI_API_KEY = process.env.YUTORI_API_KEY; + +if (!YUTORI_API_KEY) { + throw new Error('YUTORI_API_KEY is not set'); +} + +app.action( + 'cua-task', + async (ctx: KernelContext, payload?: QueryInput): Promise => { + if (!payload?.query) { + throw new Error('Query is required'); + } + + // Create browser session with optional replay recording + const session = new KernelBrowserSession(kernel, { + stealth: true, + recordReplay: payload.record_replay ?? false, + }); + + await session.start(); + console.log('Kernel browser live view url:', session.liveViewUrl); + + try { + // Run the sampling loop + const mode = payload.mode ?? 'computer_use'; + const { finalAnswer, messages } = await samplingLoop({ + model: 'n1-preview-2025-11', + task: payload.query, + apiKey: YUTORI_API_KEY, + kernel, + sessionId: session.sessionId, + cdpWsUrl: session.cdpWsUrl ?? undefined, + viewportWidth: session.viewportWidth, + viewportHeight: session.viewportHeight, + mode, + }); + + // Extract the result + const result = finalAnswer || extractLastAssistantMessage(messages); + + // Stop session and get replay URL if recording was enabled + const sessionInfo = await session.stop(); + + return { + result, + replay_url: sessionInfo.replayViewUrl, + }; + } catch (error) { + console.error('Error in sampling loop:', error); + await session.stop(); + throw error; + } + }, +); + +function extractLastAssistantMessage(messages: { role: string; content: string | unknown[] }[]): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === 'assistant') { + if (typeof msg.content === 'string') { + // Try to parse the thoughts from JSON response + try { + const parsed = JSON.parse(msg.content); + if (parsed.thoughts) { + return parsed.thoughts; + } + } catch { + return msg.content; + } + } + } + } + return 'Task completed'; +} diff --git a/pkg/templates/typescript/yutori-computer-use/loop.ts 
b/pkg/templates/typescript/yutori-computer-use/loop.ts new file mode 100644 index 0000000..351aa9c --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/loop.ts @@ -0,0 +1,286 @@ +/** + * Yutori n1 Sampling Loop + * + * Implements the agent loop for Yutori's n1 computer use model. + * n1 uses an OpenAI-compatible API with specific conventions: + * - Screenshots use role: "observation" (not "user") + * - Coordinates are returned in 1000x1000 space and need scaling + * - WebP format recommended for screenshots (this loop currently sends them as image/png data URLs) + * + * Supports two modes: + * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) + * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + * + * @see https://docs.yutori.com/reference/n1 + */ + +import OpenAI from 'openai'; +import type { Kernel } from '@onkernel/sdk'; +import { ComputerTool, type N1Action, type ToolResult } from './tools/computer'; +import { PlaywrightComputerTool } from './tools/playwright-computer'; + +/** Mode for browser interaction */ +export type BrowserMode = 'computer_use' | 'playwright'; + +interface N1ComputerTool { + execute(action: N1Action): Promise; + screenshot(): Promise; +} + +// n1 uses its own system prompt - custom prompts may degrade performance +// Per docs: "we generally do not recommend providing custom system prompts" + +interface Message { + role: 'user' | 'assistant' | 'observation'; + content: string | MessageContent[]; +} + +interface MessageContent { + type: 'text' | 'image_url'; + text?: string; + image_url?: { + url: string; + }; +} + +interface SamplingLoopOptions { + model?: string; + task: string; + apiKey: string; + kernel: Kernel; + sessionId: string; + /** CDP WebSocket URL for playwright mode */ + cdpWsUrl?: string; + maxTokens?: number; + maxIterations?: number; + /** Viewport width for coordinate scaling (default: 1200, closest to Yutori's 1280 recommendation) */ + viewportWidth?: number; + /** Viewport height for coordinate scaling 
(default: 800 per Yutori recommendation) */ + viewportHeight?: number; + /** + * Browser interaction mode: + * - computer_use: Uses Kernel's Computer Controls API (full VM screenshots) + * - playwright: Uses Playwright via CDP (viewport-only screenshots, optimized for n1) + * @default 'computer_use' + */ + mode?: BrowserMode; +} + +interface SamplingLoopResult { + messages: Message[]; + finalAnswer?: string; +} + +export async function samplingLoop({ + model = 'n1-preview-2025-11', + task, + apiKey, + kernel, + sessionId, + cdpWsUrl, + maxTokens = 4096, + maxIterations = 50, + // Default viewport: 1200x800 (closest Kernel-supported size to Yutori's recommended 1280x800) + viewportWidth = 1200, + viewportHeight = 800, + mode = 'computer_use', +}: SamplingLoopOptions): Promise { + const client = new OpenAI({ + apiKey, + baseURL: 'https://api.yutori.com/v1', + }); + + let computerTool: N1ComputerTool; + let playwrightTool: PlaywrightComputerTool | null = null; + + console.log(`Mode requested: '${mode}', cdpWsUrl available: ${cdpWsUrl != null}`); + + if (mode === 'playwright') { + if (!cdpWsUrl) { + throw new Error('cdpWsUrl is required for playwright mode'); + } + console.log(`Connecting to CDP WebSocket: ${cdpWsUrl.substring(0, 50)}...`); + playwrightTool = new PlaywrightComputerTool(cdpWsUrl, viewportWidth, viewportHeight); + await playwrightTool.connect(); + computerTool = playwrightTool; + console.log('Using playwright mode (viewport-only screenshots)'); + } else { + computerTool = new ComputerTool(kernel, sessionId, viewportWidth, viewportHeight); + console.log('Using computer_use mode (Computer Controls API)'); + } + + try { + const initialScreenshot = await computerTool.screenshot(); + + const conversationMessages: Message[] = [ + { + role: 'user', + content: [{ type: 'text', text: task }], + }, + ]; + + if (initialScreenshot.base64Image) { + conversationMessages.push({ + role: 'observation', + content: [ + { + type: 'image_url', + image_url: { + url: 
`data:image/png;base64,${initialScreenshot.base64Image}`, + }, + }, + ], + }); + } + + let iteration = 0; + let finalAnswer: string | undefined; + + while (iteration < maxIterations) { + iteration++; + console.log(`\n=== Iteration ${iteration} ===`); + + let response; + try { + response = await client.chat.completions.create({ + model, + messages: conversationMessages as OpenAI.ChatCompletionMessageParam[], + max_tokens: maxTokens, + temperature: 0.3, + }); + } catch (apiError) { + console.error('API call failed:', apiError); + throw apiError; + } + + if (!response.choices || response.choices.length === 0) { + console.error('No choices in response:', JSON.stringify(response, null, 2)); + throw new Error('No choices in API response'); + } + + const assistantMessage = response.choices[0]?.message; + if (!assistantMessage) { + throw new Error('No response from model'); + } + + const responseContent = assistantMessage.content || ''; + console.log('Assistant response:', responseContent); + + conversationMessages.push({ + role: 'assistant', + content: responseContent, + }); + + const parsed = parseN1Response(responseContent); + + if (!parsed || !parsed.actions || parsed.actions.length === 0) { + console.log('No actions found in response, ending loop'); + break; + } + + for (const action of parsed.actions) { + console.log('Executing action:', action.action_type, action); + + if (action.action_type === 'stop') { + finalAnswer = action.answer; + console.log('Stop action received, final answer:', finalAnswer); + return { messages: conversationMessages, finalAnswer }; + } + + const scaledAction = scaleCoordinates(action, viewportWidth, viewportHeight); + + let result: ToolResult; + try { + result = await computerTool.execute(scaledAction); + } catch (error) { + console.error('Action failed:', error); + result = { + error: error instanceof Error ? 
error.message : String(error), + }; + } + + if (result.base64Image || result.output) { + const observationContent: MessageContent[] = []; + + if (result.output) { + observationContent.push({ + type: 'text', + text: result.output, + }); + } + + if (result.base64Image) { + observationContent.push({ + type: 'image_url', + image_url: { + url: `data:image/png;base64,${result.base64Image}`, + }, + }); + } + + conversationMessages.push({ + role: 'observation', + content: observationContent, + }); + } else if (result.error) { + conversationMessages.push({ + role: 'observation', + content: [{ type: 'text', text: `Action failed: ${result.error}` }], + }); + } + } + } + + if (iteration >= maxIterations) { + console.log('Max iterations reached'); + } + + return { + messages: conversationMessages, + finalAnswer, + }; + } finally { + if (playwrightTool) { + await playwrightTool.disconnect(); + } + } +} + +function parseN1Response(content: string): { thoughts?: string; actions?: N1Action[] } | null { + try { + // The response should be JSON + const parsed = JSON.parse(content); + return parsed; + } catch { + // Try to extract JSON from the response if it's wrapped in text + const jsonMatch = content.match(/\{[\s\S]*\}/); + if (jsonMatch) { + try { + return JSON.parse(jsonMatch[0]); + } catch { + console.error('Failed to parse action JSON:', jsonMatch[0]); + } + } + return null; + } +} + +function scaleCoordinates(action: N1Action, viewportWidth: number, viewportHeight: number): N1Action { + const scaled = { ...action }; + + if (scaled.center_coordinates) { + scaled.center_coordinates = [ + Math.round((scaled.center_coordinates[0] / 1000) * viewportWidth), + Math.round((scaled.center_coordinates[1] / 1000) * viewportHeight), + ]; + } + + if (scaled.start_coordinates) { + scaled.start_coordinates = [ + Math.round((scaled.start_coordinates[0] / 1000) * viewportWidth), + Math.round((scaled.start_coordinates[1] / 1000) * viewportHeight), + ]; + } + + return scaled; +} diff --git 
a/pkg/templates/typescript/yutori-computer-use/package.json b/pkg/templates/typescript/yutori-computer-use/package.json new file mode 100644 index 0000000..2bc4fbe --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/package.json @@ -0,0 +1,15 @@ +{ + "name": "ts-yutori-cua", + "module": "index.ts", + "type": "module", + "private": true, + "dependencies": { + "@onkernel/sdk": "^0.24.0", + "openai": "^4.77.0", + "playwright-core": "^1.52.0" + }, + "devDependencies": { + "@types/node": "^22.15.17", + "typescript": "^5.9.3" + } +} diff --git a/pkg/templates/typescript/yutori-computer-use/session.ts b/pkg/templates/typescript/yutori-computer-use/session.ts new file mode 100644 index 0000000..24b1b9b --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/session.ts @@ -0,0 +1,235 @@ +/** + * Kernel Browser Session Manager. + * + * Provides a class for managing Kernel browser lifecycle + * with optional video replay recording. + */ + +import type { Kernel } from '@onkernel/sdk'; + +export interface SessionOptions { + /** Enable stealth mode to avoid bot detection */ + stealth?: boolean; + /** Browser session timeout in seconds */ + timeoutSeconds?: number; + /** Enable replay recording (requires paid plan) */ + recordReplay?: boolean; + /** Grace period in seconds before stopping replay */ + replayGracePeriod?: number; + /** Viewport width (default: 1200, the closest Kernel-supported width to Yutori's recommended 1280) */ + viewportWidth?: number; + /** Viewport height (default: 800 per Yutori recommendation) */ + viewportHeight?: number; +} + +export interface SessionInfo { + sessionId: string; + liveViewUrl: string; + cdpWsUrl: string; + replayId?: string; + replayViewUrl?: string; + viewportWidth: number; + viewportHeight: number; +} + +const DEFAULT_OPTIONS: Required = { + stealth: true, + timeoutSeconds: 300, + recordReplay: false, + replayGracePeriod: 5.0, + viewportWidth: 1200, + viewportHeight: 800, +}; + +/** + * Manages Kernel browser lifecycle with optional replay recording. 
+ * + * Usage: + * ```typescript + * const session = new KernelBrowserSession(kernel, options); + * await session.start(); + * try { + * // Use session.sessionId for computer controls + * } finally { + * await session.stop(); + * } + * ``` + */ +export class KernelBrowserSession { + private kernel: Kernel; + private options: Required; + + // Session state + private _sessionId: string | null = null; + private _liveViewUrl: string | null = null; + private _cdpWsUrl: string | null = null; + private _replayId: string | null = null; + private _replayViewUrl: string | null = null; + + constructor(kernel: Kernel, options: SessionOptions = {}) { + this.kernel = kernel; + this.options = { ...DEFAULT_OPTIONS, ...options }; + } + + get sessionId(): string { + if (!this._sessionId) { + throw new Error('Session not started. Call start() first.'); + } + return this._sessionId; + } + + get liveViewUrl(): string | null { + return this._liveViewUrl; + } + + get cdpWsUrl(): string | null { + return this._cdpWsUrl; + } + + get replayViewUrl(): string | null { + return this._replayViewUrl; + } + + get viewportWidth(): number { + return this.options.viewportWidth; + } + + get viewportHeight(): number { + return this.options.viewportHeight; + } + + get info(): SessionInfo { + return { + sessionId: this.sessionId, + liveViewUrl: this._liveViewUrl || '', + cdpWsUrl: this._cdpWsUrl || '', + replayId: this._replayId || undefined, + replayViewUrl: this._replayViewUrl || undefined, + viewportWidth: this.options.viewportWidth, + viewportHeight: this.options.viewportHeight, + }; + } + + async start(): Promise { + const browser = await this.kernel.browsers.create({ + stealth: this.options.stealth, + timeout_seconds: this.options.timeoutSeconds, + viewport: { + width: this.options.viewportWidth, + height: this.options.viewportHeight, + refresh_rate: 25, + }, + }); + + this._sessionId = browser.session_id; + this._liveViewUrl = browser.browser_live_view_url; + this._cdpWsUrl = browser.cdp_ws_url; 
+ + console.log(`Kernel browser created: ${this._sessionId}`); + console.log(`Live view URL: ${this._liveViewUrl}`); + + // Start replay recording if enabled + if (this.options.recordReplay) { + try { + await this.startReplay(); + } catch (error) { + console.warn(`Warning: Failed to start replay recording: ${error}`); + console.warn('Continuing without replay recording.'); + } + } + + return this.info; + } + + private async startReplay(): Promise { + if (!this._sessionId) { + return; + } + + console.log('Starting replay recording...'); + const replay = await this.kernel.browsers.replays.start(this._sessionId); + this._replayId = replay.replay_id; + console.log(`Replay recording started: ${this._replayId}`); + } + + private async stopReplay(): Promise { + if (!this._sessionId || !this._replayId) { + return; + } + + console.log('Stopping replay recording...'); + await this.kernel.browsers.replays.stop(this._replayId, { + id: this._sessionId, + }); + console.log('Replay recording stopped. Processing video...'); + + // Wait a moment for processing + await this.sleep(2000); + + // Poll for replay to be ready (with timeout) + const maxWait = 60000; // 60 seconds + const startTime = Date.now(); + let replayReady = false; + + while (Date.now() - startTime < maxWait) { + try { + const replays = await this.kernel.browsers.replays.list(this._sessionId); + for (const replay of replays) { + if (replay.replay_id === this._replayId) { + this._replayViewUrl = replay.replay_view_url; + replayReady = true; + break; + } + } + if (replayReady) { + break; + } + } catch { + // Ignore errors while polling + } + await this.sleep(1000); + } + + if (!replayReady) { + console.log('Warning: Replay may still be processing'); + } else if (this._replayViewUrl) { + console.log(`Replay view URL: ${this._replayViewUrl}`); + } + } + + async stop(): Promise { + const info = this.info; + + if (this._sessionId) { + try { + // Stop replay if recording was enabled + if (this.options.recordReplay && 
this._replayId) { + // Wait grace period before stopping to capture final state + if (this.options.replayGracePeriod > 0) { + console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`); + await this.sleep(this.options.replayGracePeriod * 1000); + } + await this.stopReplay(); + info.replayViewUrl = this._replayViewUrl || undefined; + } + } finally { + console.log(`Destroying browser session: ${this._sessionId}`); + await this.kernel.browsers.deleteByID(this._sessionId); + console.log('Browser session destroyed.'); + } + } + + // Reset state + this._sessionId = null; + this._liveViewUrl = null; + this._cdpWsUrl = null; + this._replayId = null; + this._replayViewUrl = null; + + return info; + } + + private sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); + } +} diff --git a/pkg/templates/typescript/yutori-computer-use/tools/computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts new file mode 100644 index 0000000..46fd76e --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/tools/computer.ts @@ -0,0 +1,425 @@ +/** + * Yutori n1 Computer Tool + * + * Maps n1 action format to Kernel's Computer Controls API. 
+ */ + +import { Buffer } from 'buffer'; +import type { Kernel } from '@onkernel/sdk'; + +const TYPING_DELAY_MS = 12; +const SCREENSHOT_DELAY_MS = 300; +const ACTION_DELAY_MS = 300; + +export interface ToolResult { + base64Image?: string; + output?: string; + error?: string; +} + +export class ToolError extends Error { + constructor(message: string) { + super(message); + this.name = 'ToolError'; + } +} + +// n1 action types +export type N1ActionType = + | 'click' + | 'scroll' + | 'type' + | 'key_press' + | 'hover' + | 'drag' + | 'wait' + | 'refresh' + | 'go_back' + | 'goto_url' + | 'read_texts_and_links' + | 'stop'; + +export interface N1Action { + action_type: N1ActionType; + center_coordinates?: [number, number]; + start_coordinates?: [number, number]; + direction?: 'up' | 'down' | 'left' | 'right'; + amount?: number; + text?: string; + press_enter_after?: boolean; + clear_before_typing?: boolean; + key_comb?: string; + url?: string; + answer?: string; +} + +// Key mappings from Playwright format (n1 output) to xdotool format (Kernel) +const KEY_MAP: Record = { + 'Enter': 'Return', + 'Escape': 'Escape', + 'Backspace': 'BackSpace', + 'Tab': 'Tab', + 'Delete': 'Delete', + 'ArrowUp': 'Up', + 'ArrowDown': 'Down', + 'ArrowLeft': 'Left', + 'ArrowRight': 'Right', + 'Home': 'Home', + 'End': 'End', + 'PageUp': 'Page_Up', + 'PageDown': 'Page_Down', + 'F1': 'F1', + 'F2': 'F2', + 'F3': 'F3', + 'F4': 'F4', + 'F5': 'F5', + 'F6': 'F6', + 'F7': 'F7', + 'F8': 'F8', + 'F9': 'F9', + 'F10': 'F10', + 'F11': 'F11', + 'F12': 'F12', +}; + +const MODIFIER_MAP: Record = { + 'control': 'ctrl', + 'ctrl': 'ctrl', + 'alt': 'alt', + 'shift': 'shift', + 'meta': 'super', + 'command': 'super', + 'cmd': 'super', +}; + +export class ComputerTool { + private kernel: Kernel; + private sessionId: string; + private width: number; + private height: number; + + constructor(kernel: Kernel, sessionId: string, width = 1200, height = 800) { + this.kernel = kernel; + this.sessionId = sessionId; + this.width = 
width; + this.height = height; + } + + async execute(action: N1Action): Promise { + const { action_type } = action; + + switch (action_type) { + case 'click': + return this.handleClick(action); + case 'scroll': + return this.handleScroll(action); + case 'type': + return this.handleType(action); + case 'key_press': + return this.handleKeyPress(action); + case 'hover': + return this.handleHover(action); + case 'drag': + return this.handleDrag(action); + case 'wait': + return this.handleWait(); + case 'refresh': + return this.handleRefresh(); + case 'go_back': + return this.handleGoBack(); + case 'goto_url': + return this.handleGotoUrl(action); + case 'read_texts_and_links': + return this.handleReadTextsAndLinks(); + case 'stop': + return this.handleStop(action); + default: + throw new ToolError(`Unknown action type: ${action_type}`); + } + } + + private async handleClick(action: N1Action): Promise { + const coords = this.getCoordinates(action.center_coordinates); + + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x: coords.x, + y: coords.y, + button: 'left', + click_type: 'click', + num_clicks: 1, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleScroll(action: N1Action): Promise { + const coords = this.getCoordinates(action.center_coordinates); + const direction = action.direction; + const amount = action.amount ?? 
3; + + if (!direction || !['up', 'down', 'left', 'right'].includes(direction)) { + throw new ToolError(`Invalid scroll direction: ${direction}`); + } + + const scrollDelta = amount * 100; + + let delta_x = 0; + let delta_y = 0; + + switch (direction) { + case 'up': + delta_y = -scrollDelta; + break; + case 'down': + delta_y = scrollDelta; + break; + case 'left': + delta_x = -scrollDelta; + break; + case 'right': + delta_x = scrollDelta; + break; + } + + await this.kernel.browsers.computer.scroll(this.sessionId, { + x: coords.x, + y: coords.y, + delta_x, + delta_y, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleType(action: N1Action): Promise { + const text = action.text; + if (!text) { + throw new ToolError('text is required for type action'); + } + + if (action.clear_before_typing) { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['ctrl+a'], + }); + await this.sleep(100); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['BackSpace'], + }); + await this.sleep(100); + } + + await this.kernel.browsers.computer.typeText(this.sessionId, { + text, + delay: TYPING_DELAY_MS, + }); + + if (action.press_enter_after) { + await this.sleep(100); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['Return'], + }); + } + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleKeyPress(action: N1Action): Promise { + const keyComb = action.key_comb; + if (!keyComb) { + throw new ToolError('key_comb is required for key_press action'); + } + + const mappedKey = this.mapKey(keyComb); + + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [mappedKey], + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleHover(action: N1Action): Promise { + const coords = this.getCoordinates(action.center_coordinates); + + await 
this.kernel.browsers.computer.moveMouse(this.sessionId, { + x: coords.x, + y: coords.y, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleDrag(action: N1Action): Promise { + const startCoords = this.getCoordinates(action.start_coordinates); + const endCoords = this.getCoordinates(action.center_coordinates); + + await this.kernel.browsers.computer.dragMouse(this.sessionId, { + path: [[startCoords.x, startCoords.y], [endCoords.x, endCoords.y]], + button: 'left', + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleWait(): Promise { + await this.sleep(2000); + return this.screenshot(); + } + + private async handleRefresh(): Promise { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['F5'], + }); + + await this.sleep(2000); + return this.screenshot(); + } + + private async handleGoBack(): Promise { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['alt+Left'], + }); + + await this.sleep(1500); + return this.screenshot(); + } + + private async handleGotoUrl(action: N1Action): Promise { + const url = action.url; + if (!url) { + throw new ToolError('url is required for goto_url action'); + } + + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['ctrl+l'], + }); + await this.sleep(ACTION_DELAY_MS); + + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['ctrl+a'], + }); + await this.sleep(100); + + await this.kernel.browsers.computer.typeText(this.sessionId, { + text: url, + delay: TYPING_DELAY_MS, + }); + await this.sleep(ACTION_DELAY_MS); + + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['Return'], + }); + + await this.sleep(2000); + return this.screenshot(); + } + + private async handleReadTextsAndLinks(): Promise { + try { + // Get AI snapshot via Playwright Execution API + const result = await this.kernel.browsers.playwright.execute( + 
this.sessionId, + { + code: ` + const snapshot = await page._snapshotForAI(); + const url = page.url(); + const title = await page.title(); + return { url, title, snapshot }; + `, + timeout_sec: 30 + } + ); + + // Get screenshot via Computer Controls API + const screenshotResult = await this.screenshot(); + + if (result.success && result.result) { + const { url, title, snapshot } = result.result as { + url: string; + title: string; + snapshot: string; + }; + + return { + base64Image: screenshotResult.base64Image, + output: JSON.stringify({ url, title, snapshot }, null, 2) + }; + } + + // Fallback to just screenshot if Playwright execution fails + console.warn('Playwright execution failed, falling back to screenshot only'); + return screenshotResult; + } catch (error) { + console.warn('read_texts_and_links failed:', error); + return this.screenshot(); + } + } + + private handleStop(action: N1Action): ToolResult { + // Return the final answer without taking a screenshot + return { + output: action.answer || 'Task completed', + }; + } + + async screenshot(): Promise { + try { + const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId); + const blob = await response.blob(); + const arrayBuffer = await blob.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + + return { + base64Image: buffer.toString('base64'), + }; + } catch (error) { + throw new ToolError(`Failed to take screenshot: ${error}`); + } + } + + private getCoordinates(coords?: [number, number]): { x: number; y: number } { + if (!coords || coords.length !== 2) { + // Default to center of screen + return { x: this.width / 2, y: this.height / 2 }; + } + + const [x, y] = coords; + if (typeof x !== 'number' || typeof y !== 'number' || x < 0 || y < 0) { + throw new ToolError(`Invalid coordinates: ${JSON.stringify(coords)}`); + } + + return { x, y }; + } + + private mapKey(key: string): string { + // Handle modifier combinations (e.g., "Control+a" -> "ctrl+a") + if 
(key.includes('+')) { + const parts = key.split('+'); + const mappedParts = parts.map(part => { + const trimmed = part.trim(); + const lower = trimmed.toLowerCase(); + + // Map modifier names + if (MODIFIER_MAP[lower]) { + return MODIFIER_MAP[lower]; + } + + // Check KEY_MAP for special keys + return KEY_MAP[trimmed] || trimmed; + }); + return mappedParts.join('+'); + } + + return KEY_MAP[key] || key; + } + + private sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); + } +} diff --git a/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts new file mode 100644 index 0000000..d6ce229 --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/tools/playwright-computer.ts @@ -0,0 +1,363 @@ +/** + * Yutori n1 Playwright Computer Tool + * + * Maps n1 action format to Playwright methods via CDP WebSocket connection. + * Uses viewport-only screenshots optimized for Yutori n1's training data. 
+ * + * @see https://docs.yutori.com/reference/n1#screenshot-requirements + */ + +import { chromium, type Browser, type BrowserContext, type Page } from 'playwright-core'; +import type { ToolResult, N1Action } from './computer'; +import { ToolError } from './computer'; + +const SCREENSHOT_DELAY_MS = 300; + +// Key mappings from n1 output format to Playwright format +const KEY_MAP: Record = { + 'Return': 'Enter', + 'BackSpace': 'Backspace', + 'Page_Up': 'PageUp', + 'Page_Down': 'PageDown', +}; + +const MODIFIER_MAP: Record = { + 'ctrl': 'Control', + 'super': 'Meta', + 'command': 'Meta', + 'cmd': 'Meta', +}; + +export class PlaywrightComputerTool { + private cdpWsUrl: string; + private width: number; + private height: number; + private browser: Browser | null = null; + private context: BrowserContext | null = null; + private page: Page | null = null; + + constructor(cdpWsUrl: string, width = 1200, height = 800) { + this.cdpWsUrl = cdpWsUrl; + this.width = width; + this.height = height; + } + + async connect(): Promise { + if (this.browser) { + return; // Already connected + } + + this.browser = await chromium.connectOverCDP(this.cdpWsUrl); + + // Get existing context or create new one + this.context = this.browser.contexts()[0]; + if (!this.context) { + this.context = await this.browser.newContext(); + } + + // Handle new page events + this.context.on('page', this.handleNewPage.bind(this)); + + // Get existing page or create new one + this.page = this.context.pages()[0]; + if (!this.page) { + this.page = await this.context.newPage(); + } + + // Set viewport size to Yutori's recommended dimensions + await this.page.setViewportSize({ width: this.width, height: this.height }); + this.page.on('close', this.handlePageClose.bind(this)); + } + + async disconnect(): Promise { + if (this.browser) { + this.browser = null; + this.context = null; + this.page = null; + } + } + + private handleNewPage(page: Page): void { + console.log('New page created'); + this.page = page; + 
page.on('close', this.handlePageClose.bind(this)); + } + + private handlePageClose(closedPage: Page): void { + console.log('Page closed'); + if (this.page === closedPage && this.context) { + const pages = this.context.pages(); + if (pages.length > 0) { + this.page = pages[pages.length - 1]; + } else { + console.warn('Warning: All pages have been closed.'); + this.page = null; + } + } + } + + private assertPage(): asserts this is { page: Page } { + if (!this.page) { + throw new ToolError('Page not available. Did you call connect()?'); + } + } + + async execute(action: N1Action): Promise { + this.assertPage(); + const { action_type } = action; + + switch (action_type) { + case 'click': + return this.handleClick(action); + case 'scroll': + return this.handleScroll(action); + case 'type': + return this.handleType(action); + case 'key_press': + return this.handleKeyPress(action); + case 'hover': + return this.handleHover(action); + case 'drag': + return this.handleDrag(action); + case 'wait': + return this.handleWait(); + case 'refresh': + return this.handleRefresh(); + case 'go_back': + return this.handleGoBack(); + case 'goto_url': + return this.handleGotoUrl(action); + case 'read_texts_and_links': + return this.handleReadTextsAndLinks(); + case 'stop': + return this.handleStop(action); + default: + throw new ToolError(`Unknown action type: ${action_type}`); + } + } + + private async handleClick(action: N1Action): Promise { + this.assertPage(); + const coords = this.getCoordinates(action.center_coordinates); + + await this.page.mouse.click(coords.x, coords.y); + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleScroll(action: N1Action): Promise { + this.assertPage(); + const coords = this.getCoordinates(action.center_coordinates); + const direction = action.direction; + const amount = action.amount ?? 
3; + + if (!direction || !['up', 'down', 'left', 'right'].includes(direction)) { + throw new ToolError(`Invalid scroll direction: ${direction}`); + } + + const scrollDelta = amount * 100; + + await this.page.mouse.move(coords.x, coords.y); + + let deltaX = 0; + let deltaY = 0; + + switch (direction) { + case 'up': + deltaY = -scrollDelta; + break; + case 'down': + deltaY = scrollDelta; + break; + case 'left': + deltaX = -scrollDelta; + break; + case 'right': + deltaX = scrollDelta; + break; + } + + await this.page.mouse.wheel(deltaX, deltaY); + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleType(action: N1Action): Promise { + this.assertPage(); + const text = action.text; + if (!text) { + throw new ToolError('text is required for type action'); + } + + if (action.clear_before_typing) { + await this.page.keyboard.press('Control+a'); + await this.sleep(100); + await this.page.keyboard.press('Backspace'); + await this.sleep(100); + } + + await this.page.keyboard.type(text); + + if (action.press_enter_after) { + await this.sleep(100); + await this.page.keyboard.press('Enter'); + } + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleKeyPress(action: N1Action): Promise { + this.assertPage(); + const keyComb = action.key_comb; + if (!keyComb) { + throw new ToolError('key_comb is required for key_press action'); + } + + const mappedKey = this.mapKeyToPlaywright(keyComb); + await this.page.keyboard.press(mappedKey); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleHover(action: N1Action): Promise { + this.assertPage(); + const coords = this.getCoordinates(action.center_coordinates); + + await this.page.mouse.move(coords.x, coords.y); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleDrag(action: N1Action): Promise { + this.assertPage(); + const startCoords = 
this.getCoordinates(action.start_coordinates); + const endCoords = this.getCoordinates(action.center_coordinates); + + await this.page.mouse.move(startCoords.x, startCoords.y); + await this.page.mouse.down(); + await this.sleep(50); + await this.page.mouse.move(endCoords.x, endCoords.y, { steps: 12 }); + await this.page.mouse.up(); + + await this.sleep(300); + return this.screenshot(); + } + + private async handleWait(): Promise { + await this.sleep(2000); + return this.screenshot(); + } + + private async handleRefresh(): Promise { + this.assertPage(); + await this.page.reload(); + await this.sleep(2000); + return this.screenshot(); + } + + private async handleGoBack(): Promise { + this.assertPage(); + await this.page.goBack(); + await this.sleep(1500); + return this.screenshot(); + } + + private async handleGotoUrl(action: N1Action): Promise { + this.assertPage(); + const url = action.url; + if (!url) { + throw new ToolError('url is required for goto_url action'); + } + + await this.page.goto(url); + await this.sleep(2000); + return this.screenshot(); + } + + private async handleReadTextsAndLinks(): Promise { + this.assertPage(); + try { + const snapshot = await this.page.locator('body').ariaSnapshot(); + const url = this.page.url(); + const title = await this.page.title(); + + const screenshotResult = await this.screenshot(); + + return { + base64Image: screenshotResult.base64Image, + output: JSON.stringify({ url, title, snapshot }, null, 2), + }; + } catch (error) { + console.warn('read_texts_and_links failed:', error); + return this.screenshot(); + } + } + + private handleStop(action: N1Action): ToolResult { + // Return the final answer without taking a screenshot + return { + output: action.answer || 'Task completed', + }; + } + + async screenshot(): Promise { + this.assertPage(); + try { + const buffer = await this.page.screenshot({ fullPage: false }); + + return { + base64Image: buffer.toString('base64'), + }; + } catch (error) { + throw new 
ToolError(`Failed to take screenshot: ${error}`); + } + } + + getCurrentUrl(): string { + this.assertPage(); + return this.page.url(); + } + + private getCoordinates(coords?: [number, number]): { x: number; y: number } { + if (!coords || coords.length !== 2) { + // Default to center of viewport + return { x: this.width / 2, y: this.height / 2 }; + } + + const [x, y] = coords; + if (typeof x !== 'number' || typeof y !== 'number' || x < 0 || y < 0) { + throw new ToolError(`Invalid coordinates: ${JSON.stringify(coords)}`); + } + + return { x, y }; + } + + private mapKeyToPlaywright(key: string): string { + // Handle modifier combinations (e.g., "ctrl+a" -> "Control+a") + if (key.includes('+')) { + const parts = key.split('+'); + const mappedParts = parts.map((part) => { + const trimmed = part.trim(); + const lower = trimmed.toLowerCase(); + + // Map modifier names + if (MODIFIER_MAP[lower]) { + return MODIFIER_MAP[lower]; + } + + // Check KEY_MAP for special keys + return KEY_MAP[trimmed] || trimmed; + }); + return mappedParts.join('+'); + } + + return KEY_MAP[key] || key; + } + + private sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); + } +} diff --git a/pkg/templates/typescript/yutori-computer-use/tsconfig.json b/pkg/templates/typescript/yutori-computer-use/tsconfig.json new file mode 100644 index 0000000..13616f5 --- /dev/null +++ b/pkg/templates/typescript/yutori-computer-use/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../tsconfig.base.json", + "compilerOptions": { + "outDir": "./dist", + "rootDir": "." + }, + "include": ["./**/*.ts"], + "exclude": ["node_modules", "dist"] +}