From fb2991ba1aadb770e503d303f0eb25baf80e3366 Mon Sep 17 00:00:00 2001
From: Gemini AI
Date: Sun, 14 Dec 2025 02:31:53 +0400
Subject: [PATCH] feat: add UI Automation 'find' command for true textual vision

---
Review notes:
- The first input.ps1 hunk now adds only the two UI Automation assemblies.
  The earlier revision also inserted a placeholder 'switch' stub there, which
  opened braces it never closed and duplicated the 'find' case.
- 'find' treats an element whose BoundingRectangle is empty as not found,
  because its coordinates cannot be converted to [int].
- Blob ids on the 'index' lines predate this revision. Context-line
  indentation was reconstructed; use 'git apply --ignore-whitespace' if the
  hunks do not match.

 bin/input.ps1        | 28 ++++++++++++++++++++++++++++
 bin/opencode-ink.mjs | 13 +++++++------
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/bin/input.ps1 b/bin/input.ps1
index 6e676f9..c40508b 100644
--- a/bin/input.ps1
+++ b/bin/input.ps1
@@ -9,6 +9,8 @@ param(
 # Load required assemblies
 Add-Type -AssemblyName System.Windows.Forms
 Add-Type -AssemblyName System.Drawing
+Add-Type -AssemblyName UIAutomationClient
+Add-Type -AssemblyName UIAutomationTypes
 
 # C# P/Invoke for advanced Input (SendInput is more reliable than SendKeys)
 $code = @"
@@ -97,6 +99,32 @@ switch ($Command.ToLower()) {
         Write-Host "Screenshot saved to $fullPath"
     }
 
+    "find" {
+        if ($Params.Count -lt 1) { Write-Error "Usage: find 'Name'"; exit 1 }
+        $targetName = $Params -join " "
+
+        Write-Host "Searching for UI Element: '$targetName'..."
+
+        $root = [System.Windows.Automation.AutomationElement]::RootElement
+        $cond = New-Object System.Windows.Automation.PropertyCondition([System.Windows.Automation.AutomationElement]::NameProperty, $targetName)
+
+        # Try finding directly (fast)
+        $element = $root.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
+
+        # An element with no on-screen rectangle reports Rect.Empty, whose infinite coordinates [int] cannot convert.
+        if ($element -and -not $element.Current.BoundingRectangle.IsEmpty) {
+            $rect = $element.Current.BoundingRectangle
+            $centerX = [int]($rect.X + ($rect.Width / 2))
+            $centerY = [int]($rect.Y + ($rect.Height / 2))
+            Write-Host "Found '$targetName' at ($centerX, $centerY)"
+
+            # Auto-selection support return format
+            Write-Host "COORD:$centerX,$centerY"
+        } else {
+            Write-Host "Element '$targetName' not found visible on desktop."
+        }
+    }
+
     "apps" {
         # List all processes with a window title (visible apps)
         $apps = Get-Process | Where-Object { $_.MainWindowTitle -ne "" } | Select-Object Id, MainWindowTitle
diff --git a/bin/opencode-ink.mjs b/bin/opencode-ink.mjs
index 7ee5329..03804ad 100644
--- a/bin/opencode-ink.mjs
+++ b/bin/opencode-ink.mjs
@@ -491,14 +491,15 @@ Use it to control the mouse, keyboard, and "see" the system.
 
 ## 👁️ VISION & BLINDNESS PROTOCOL:
 You are a TEXT-BASED intelligence. You CANNOT see images/screenshots you take.
-- **\`input.ps1 screenshot\`**: Creates an image for the **USER** to see. You learn NOTHING from this.
+- **\`input.ps1 find "Name"\`**: **TRUE VISION**. Finds a UI element (button/window) by text and tells you where it is.
 - **\`input.ps1 apps\`**: Your "Eyes" for windows. Returns TEXT list of open apps.
-- **\`input.ps1 screen\`**: Your "Eyes" for geometry. Returns TEXT resolution (e.g. 1920x1080).
+- **\`input.ps1 screen\`**: Your "Eyes" for geometry. Returns TEXT resolution.
 
-### 📐 THE LAW OF COORDINATES:
-Since you cannot see buttons, you MUST calculate them using \`screen\` dimensions.
-1. Run \`powershell bin/input.ps1 screen\`.
-2. Get Output: \`Width x Height\` (e.g. 1920 x 1200).
+### 📐 THE LAW OF ACCURACY:
+1. **FIND FIRST**: If you need to click a button, SEARCH FOR IT.
+   - \`powershell bin/input.ps1 find "Start"\` -> Prints "Found 'Start' at (30, 1190)" and "COORD:30,1190".
+   - **THEN** use those coordinates to click.
+2. **FALLBACK**: Only calculate coordinates manually if \`find\` fails.
 3. **Start Menu Logic:** Bottom-Left corner.
    - X = 0 to 50
    - Y = Height - 10 (e.g. 1190).