/**
 * Vision Loop - Automatic Visual Feedback Automation
 * Implements the "screenshot → LLM → action → repeat" pattern
 *
 * Credit: Inspired by AmberSahdev/Open-Interface (https://github.com/AmberSahdev/Open-Interface)
 * License: MIT
 *
 * This module provides:
 * 1. Screenshot capture
 * 2. Vision model analysis
 * 3. Action extraction and execution
 * 4. Course correction (retry on failure)
 * 5. Goal completion detection
 */

import { spawn } from 'child_process';
import fs from 'fs';
import path from 'path';

// Configuration
const CONFIG = {
  maxIterations: 20,      // Maximum steps before giving up
  screenshotDelay: 500,   // ms to wait after action before screenshot
  actionTimeout: 10000,   // ms timeout for each action
  screenshotDir: 'screenshots',
  inputScript: 'bin/input.ps1'
};

/**
 * Execute a PowerShell command via input.ps1
 */
export async function executeAction(command, args = []) {
  return new Promise((resolve, reject) => {
    const fullArgs = [CONFIG.inputScript, command, ...args];
    const proc = spawn('powershell', ['-File', ...fullArgs], {
      cwd: process.cwd(),
      shell: true
    });

    let stdout = '';
    let stderr = '';

    proc.stdout.on('data', (data) => { stdout += data.toString(); });
    proc.stderr.on('data', (data) => { stderr += data.toString(); });

    // Timeout: kill the process and fail the action if it runs too long;
    // the timer is cleared on normal exit so it cannot fire afterwards
    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error('Action timeout'));
    }, CONFIG.actionTimeout);

    proc.on('close', (code) => {
      clearTimeout(timer);
      if (code === 0) {
        resolve({ success: true, output: stdout.trim() });
      } else {
        resolve({ success: false, output: stdout.trim(), error: stderr.trim() });
      }
    });

    proc.on('error', (err) => {
      clearTimeout(timer);
      reject(err);
    });
  });
}

/**
 * Capture screenshot and return path
 */
export async function captureScreenshot(filename = null) {
  const dir = path.resolve(CONFIG.screenshotDir);
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const file = filename || `screenshot_${Date.now()}.png`;
  const fullPath = path.join(dir, file);

  const result = await executeAction('screenshot', [fullPath]);
  if (result.success && fs.existsSync(fullPath)) {
    return fullPath;
  }
  throw new Error('Failed to capture screenshot: ' + result.error);
}

/**
 * Get list of open applications/windows
 */
export async function getOpenApps() {
  const result = await executeAction('apps');
  return result.output;
}
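/*
 * Illustrative sketch of using the helpers above outside the loop. The
 * 'key' and 'apps' verbs are the ones advertised to the model in
 * buildContext() further down; what input.ps1 actually accepts depends on
 * your local script.
 *
 *   const shot = await captureScreenshot('before.png');      // e.g. screenshots/before.png
 *   const apps = await getOpenApps();                         // window list as printed by `input.ps1 apps`
 *   const keyResult = await executeAction('key', ['LWIN']);
 *   if (!keyResult.success) {
 *     console.error('key press failed:', keyResult.error);
 *   }
 */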
/**
 * Parse LLM response for actions
 * Extracts PowerShell commands from code blocks
 */
export function parseActionsFromResponse(response) {
  const actions = [];

  // Match PowerShell code blocks
  const codeBlockRegex = /```(?:powershell)?\s*([\s\S]*?)```/gi;
  let match;

  while ((match = codeBlockRegex.exec(response)) !== null) {
    const code = match[1].trim();

    // Parse individual commands
    const lines = code.split('\n').filter(l => l.trim() && !l.startsWith('#'));
    for (const line of lines) {
      // Extract input.ps1 commands
      const inputMatch = line.match(/(?:powershell\s+)?(?:\.\\)?bin[\/\\]input\.ps1\s+(\w+)\s*(.*)/i);
      if (inputMatch) {
        actions.push({
          type: 'input',
          command: inputMatch[1],
          args: inputMatch[2] ? inputMatch[2].trim().split(/\s+/) : []
        });
      }
    }
  }

  return actions;
}

/**
 * Check if goal is complete based on LLM response
 */
export function isGoalComplete(response) {
  const completionIndicators = [
    'task completed',
    'goal achieved',
    'successfully completed',
    'done',
    'finished',
    'completed successfully',
    'mission accomplished'
  ];

  const lowerResponse = response.toLowerCase();
  return completionIndicators.some(indicator => lowerResponse.includes(indicator));
}
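/*
 * Illustrative example of what the two parsers above produce. Given a model
 * reply containing a fenced block such as:
 *
 *   ```powershell
 *   powershell bin/input.ps1 uiclick "Start"
 *   powershell bin/input.ps1 type "notepad"
 *   ```
 *
 * parseActionsFromResponse() yields (quote characters are kept inside args):
 *
 *   [
 *     { type: 'input', command: 'uiclick', args: ['"Start"'] },
 *     { type: 'input', command: 'type', args: ['"notepad"'] }
 *   ]
 *
 * isGoalComplete() is a plain substring check, so a reply saying "Task completed"
 * ends the loop - but so does any reply that merely contains "done", which is
 * worth keeping in mind when writing the prompt.
 */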
/**
 * Vision Loop State Machine
 */
export class VisionLoop {
  constructor(options = {}) {
    this.maxIterations = options.maxIterations || CONFIG.maxIterations;
    this.onStep = options.onStep || (() => {});
    this.onAction = options.onAction || (() => {});
    this.onComplete = options.onComplete || (() => {});
    this.onError = options.onError || (() => {});
    this.sendToLLM = options.sendToLLM || null; // Must be provided

    this.iteration = 0;
    this.history = [];
    this.isRunning = false;
  }

  /**
   * Start the vision loop
   * @param {string} goal - The user's goal/task description
   */
  async run(goal) {
    if (!this.sendToLLM) {
      throw new Error('sendToLLM callback must be provided');
    }

    this.isRunning = true;
    this.iteration = 0;
    this.history = [];

    // Initial context gathering
    const apps = await getOpenApps();

    while (this.isRunning && this.iteration < this.maxIterations) {
      this.iteration++;

      try {
        // Step 1: Capture current state
        const screenshotPath = await captureScreenshot(`step_${this.iteration}.png`);
        this.onStep({ iteration: this.iteration, phase: 'capture', screenshot: screenshotPath });

        // Step 2: Build context for LLM
        const context = this.buildContext(goal, apps, screenshotPath);

        // Step 3: Ask LLM for next action
        const response = await this.sendToLLM(context);
        this.history.push({
          iteration: this.iteration,
          context: context.substring(0, 500) + '...',
          response: response.substring(0, 500) + '...'
        });

        // Step 4: Check if goal is complete
        if (isGoalComplete(response)) {
          this.onComplete({ iterations: this.iteration, history: this.history });
          this.isRunning = false;
          return { success: true, iterations: this.iteration };
        }

        // Step 5: Parse and execute actions
        const actions = parseActionsFromResponse(response);
        if (actions.length === 0) {
          // LLM didn't provide actions, might need clarification
          this.onError({ type: 'no_actions', response: response, iteration: this.iteration });
          continue;
        }

        for (const action of actions) {
          this.onAction({ iteration: this.iteration, action: action });

          const result = await executeAction(action.command, action.args);
          if (!result.success) {
            this.onError({
              type: 'action_failed',
              action: action,
              error: result.error,
              iteration: this.iteration
            });
          }

          // Wait for UI to update
          await new Promise(resolve => setTimeout(resolve, CONFIG.screenshotDelay));
        }
      } catch (error) {
        this.onError({ type: 'exception', error: error.message, iteration: this.iteration });
      }
    }

    if (this.iteration >= this.maxIterations) {
      return { success: false, reason: 'max_iterations', iterations: this.iteration };
    }
    return { success: false, reason: 'stopped', iterations: this.iteration };
  }

  /**
   * Build context/prompt for LLM
   */
  buildContext(goal, apps, screenshotPath) {
    const historyContext = this.history.slice(-3).map(h =>
      `Step ${h.iteration}: ${h.response.substring(0, 200)}...`
    ).join('\n');

    return `# Vision Loop - Autonomous Computer Control
Credit: Inspired by AmberSahdev/Open-Interface

## Current Goal
${goal}

## Current State
- Iteration: ${this.iteration}/${this.maxIterations}
- Screenshot: ${screenshotPath}
- Open Applications: ${apps}

## Recent History
${historyContext || 'No previous actions'}

## Instructions
1. Analyze the current state based on the screenshot path and open apps
2. Determine the next action(s) to achieve the goal
3. Provide PowerShell commands using bin/input.ps1
4. If the goal is complete, say "Task completed"

## Available Commands
- powershell bin/input.ps1 key LWIN - Press Windows key
- powershell bin/input.ps1 uiclick "Element Name" - Click UI element
- powershell bin/input.ps1 type "text" - Type text
- powershell bin/input.ps1 click - Left click at current position
- powershell bin/input.ps1 mouse X Y - Move mouse
- powershell bin/input.ps1 apps - List open windows

## Response Format
Provide your analysis and commands in PowerShell code blocks:
\`\`\`powershell
powershell bin/input.ps1 [command] [args]
\`\`\`
`;
  }

  /**
   * Stop the loop
   */
  stop() {
    this.isRunning = false;
  }
}

/**
 * Simple one-shot action execution (no loop)
 */
export async function executeOneShot(commands) {
  const results = [];
  for (const cmd of commands) {
    const result = await executeAction(cmd.command, cmd.args);
    results.push({ command: cmd, result: result });
    if (!result.success) {
      break;
    }
    await new Promise(resolve => setTimeout(resolve, 200));
  }
  return results;
}

export default {
  VisionLoop,
  executeAction,
  captureScreenshot,
  getOpenApps,
  parseActionsFromResponse,
  isGoalComplete,
  executeOneShot
};
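/*
 * Illustrative usage sketch (not part of the module API). `callVisionModel`
 * stands in for whatever LLM backend you use; it should take the prompt built
 * by buildContext() (and, ideally, the screenshot file it references) and
 * resolve with the model's text reply. Adjust the import path to wherever
 * this file lives in your project.
 *
 *   import { VisionLoop } from './vision-loop.js';
 *
 *   const loop = new VisionLoop({
 *     maxIterations: 10,
 *     sendToLLM: (prompt) => callVisionModel(prompt),                     // hypothetical backend call
 *     onStep: (s) => console.log(`step ${s.iteration}:`, s.screenshot),
 *     onAction: (a) => console.log('running', a.action.command, a.action.args),
 *     onError: (e) => console.warn('problem at iteration', e.iteration, e.type),
 *     onComplete: (r) => console.log('goal reached after', r.iterations, 'iterations')
 *   });
 *
 *   const result = await loop.run('Open Notepad and type "hello"');
 *   console.log(result);   // { success: true, iterations: n } on success
 */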