353 lines
10 KiB
JavaScript
353 lines
10 KiB
JavaScript
/**
|
|
* Vision Loop - Automatic Visual Feedback Automation
|
|
* Implements the "screenshot → LLM → action → repeat" pattern
|
|
*
|
|
* Credit: Inspired by AmberSahdev/Open-Interface (https://github.com/AmberSahdev/Open-Interface)
|
|
* License: MIT
|
|
*
|
|
* This module provides:
|
|
* 1. Screenshot capture
|
|
* 2. Vision model analysis
|
|
* 3. Action extraction and execution
|
|
* 4. Course correction (retry on failure)
|
|
* 5. Goal completion detection
|
|
*/
|
|
|
|
import { spawn, execSync } from 'child_process';
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
|
|
// Configuration shared by the helpers in this module.
// Frozen so that an accidental write fails loudly (ES modules are strict,
// so assigning to a frozen object throws) instead of silently changing the
// behavior of later calls.
const CONFIG = Object.freeze({
  maxIterations: 20, // Maximum steps before giving up
  screenshotDelay: 500, // ms to wait after action before screenshot
  actionTimeout: 10000, // ms timeout for each action
  screenshotDir: 'screenshots', // directory screenshots are written to
  inputScript: 'bin/input.ps1', // script handed to `powershell -File`
});
|
|
|
|
/**
 * Execute a PowerShell command via input.ps1.
 *
 * Spawns `powershell -File <CONFIG.inputScript> <command> ...args` in the
 * current working directory and buffers its stdout and stderr.
 *
 * @param {string} command - Sub-command handed to the input script.
 * @param {string[]} [args=[]] - Extra arguments for the sub-command.
 * @returns {Promise<{success: boolean, output: string, error?: string}>}
 *   Resolves with `success: true` and the trimmed stdout when the process
 *   exits with code 0; otherwise resolves with `success: false`, the trimmed
 *   stdout in `output` and the trimmed stderr in `error`.
 * @throws {Error} The promise rejects when the process cannot be spawned, or
 *   with `Error('Action timeout')` when it has not closed within
 *   `CONFIG.actionTimeout` milliseconds.
 */
export async function executeAction(command, args = []) {
  return new Promise((resolve, reject) => {
    const fullArgs = [CONFIG.inputScript, command, ...args];

    // NOTE(review): with `shell: true` the arguments are joined into a single
    // shell command line without escaping, so shell metacharacters in `args`
    // are interpreted by the shell. Callers must not pass untrusted text
    // here. Kept as-is because quoted arguments that were split on
    // whitespace upstream rely on the shell re-joining them.
    const proc = spawn('powershell', ['-File', ...fullArgs], {
      cwd: process.cwd(),
      shell: true,
    });

    let stdout = '';
    let stderr = '';

    // Watchdog: give up on actions that never finish.
    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error('Action timeout'));
    }, CONFIG.actionTimeout);

    proc.stdout.on('data', (data) => {
      stdout += data.toString();
    });

    proc.stderr.on('data', (data) => {
      stderr += data.toString();
    });

    proc.on('close', (code) => {
      // Without this the pending timer keeps the event loop alive for the
      // rest of the timeout window and later kills an already-exited process.
      clearTimeout(timer);

      if (code === 0) {
        resolve({ success: true, output: stdout.trim() });
      } else {
        resolve({ success: false, output: stdout.trim(), error: stderr.trim() });
      }
    });

    proc.on('error', (err) => {
      clearTimeout(timer);
      reject(err);
    });
  });
}
|
|
|
|
/**
 * Capture a screenshot through the input script and return its path.
 *
 * The image is written into `CONFIG.screenshotDir` (created on demand),
 * resolved to an absolute path.
 *
 * @param {string|null} [filename=null] - File name to use inside the
 *   screenshot directory; defaults to `screenshot_<timestamp>.png`.
 * @returns {Promise<string>} Absolute path of the captured file.
 * @throws {Error} When the script fails or the expected file does not exist
 *   afterwards. The message always starts with
 *   "Failed to capture screenshot: ".
 */
export async function captureScreenshot(filename = null) {
  const dir = path.resolve(CONFIG.screenshotDir);
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const file = filename || `screenshot_${Date.now()}.png`;
  const fullPath = path.join(dir, file);

  const result = await executeAction('screenshot', [fullPath]);

  if (result.success && fs.existsSync(fullPath)) {
    return fullPath;
  }

  // `result.error` is only set when the script exited with a failure, so
  // spell out the other failure mode instead of printing "undefined".
  const reason = result.success
    ? `script reported success but ${fullPath} was not created`
    : result.error || result.output || 'unknown error';

  throw new Error('Failed to capture screenshot: ' + reason);
}
|
|
|
|
/**
 * List the currently open applications/windows.
 *
 * Runs the input script's `apps` command and hands back whatever it printed;
 * the success flag of the underlying call is not inspected.
 *
 * @returns {Promise<string>} Output of the `apps` command.
 */
export async function getOpenApps() {
  const { output } = await executeAction('apps');
  return output;
}
|
|
|
|
/**
 * Parse an LLM response for actions.
 *
 * Scans every fenced code block (optionally tagged `powershell`) and extracts
 * the lines that invoke `bin/input.ps1`. Blank lines and `#` comment lines,
 * indented or not, are skipped. Text outside code blocks is ignored.
 *
 * Arguments are split on whitespace, except that a token starting with a
 * quote runs to the matching closing quote. The quotes are kept in the
 * token, so `uiclick "Start Menu"` yields `['"Start Menu"']`.
 *
 * @param {string} response - Raw text returned by the model.
 * @returns {Array<{type: 'input', command: string, args: string[]}>}
 *   Actions in the order they appear in the response.
 */
export function parseActionsFromResponse(response) {
  const codeBlockRegex = /```(?:powershell)?\s*([\s\S]*?)```/gi;
  const commandRegex = /(?:powershell\s+)?(?:\.\\)?bin[\/\\]input\.ps1\s+(\w+)\s*(.*)/i;
  // A double- or single-quoted run counts as one token; anything else is
  // split on whitespace. Unbalanced quotes fall through to the \S+ branch.
  const argRegex = /"[^"]*"|'[^']*'|\S+/g;

  const actions = [];

  for (const [, code] of String(response).matchAll(codeBlockRegex)) {
    // Accept both LF and CRLF line endings.
    for (const rawLine of code.split(/\r?\n/)) {
      const line = rawLine.trim();

      // Trim first so that indented comment lines are skipped as well.
      if (line === '' || line.startsWith('#')) {
        continue;
      }

      const inputMatch = line.match(commandRegex);
      if (!inputMatch) {
        continue;
      }

      const [, command, rest] = inputMatch;
      actions.push({
        type: 'input',
        command,
        args: rest.match(argRegex) || [],
      });
    }
  }

  return actions;
}
|
|
|
|
// Phrases that signal the model considers the goal reached.
const COMPLETION_INDICATORS = [
  'task completed',
  'goal achieved',
  'successfully completed',
  'done',
  'finished',
  'completed successfully',
  'mission accomplished',
];

// One case-insensitive pattern for all phrases. Word boundaries keep "done"
// from matching inside "abandoned" and "finished" inside "unfinished"; the
// space inside a phrase matches any whitespace run so wrapped text still hits.
const COMPLETION_PATTERN = new RegExp(
  `\\b(?:${COMPLETION_INDICATORS.map((phrase) => phrase.replace(/ /g, '\\s+')).join('|')})\\b`,
  'i',
);

/**
 * Check if goal is complete based on LLM response.
 *
 * Returns true when the response contains one of the completion phrases as
 * whole words, ignoring case. This is a keyword heuristic: negated wording
 * such as "not done yet" still counts as a match.
 *
 * @param {string} response - Raw text returned by the model.
 * @returns {boolean} True when a completion phrase is present; false for
 *   non-string input.
 */
export function isGoalComplete(response) {
  if (typeof response !== 'string') {
    return false;
  }

  return COMPLETION_PATTERN.test(response);
}
|
|
|
|
/**
 * Vision Loop State Machine
 *
 * Repeats "capture screenshot -> ask the LLM -> execute the returned
 * actions" until the LLM signals completion, `stop()` is called, or
 * `maxIterations` steps have run.
 */
export class VisionLoop {
  /**
   * @param {object} [options]
   * @param {number} [options.maxIterations] - Step limit; falls back to
   *   `CONFIG.maxIterations` when missing or falsy.
   * @param {Function} [options.onStep] - Called after each capture with
   *   `{ iteration, phase: 'capture', screenshot }`.
   * @param {Function} [options.onAction] - Called before each action with
   *   `{ iteration, action }`.
   * @param {Function} [options.onComplete] - Called with
   *   `{ iterations, history }` when the goal is detected as complete.
   * @param {Function} [options.onError] - Called with `{ type, iteration, ... }`
   *   where `type` is 'no_actions', 'action_failed' or 'exception'.
   * @param {Function} options.sendToLLM - Required. Receives the prompt string
   *   built by `buildContext` and must resolve to the model's text response.
   */
  constructor(options = {}) {
    this.maxIterations = options.maxIterations || CONFIG.maxIterations;
    this.onStep = options.onStep || (() => {});
    this.onAction = options.onAction || (() => {});
    this.onComplete = options.onComplete || (() => {});
    this.onError = options.onError || (() => {});
    this.sendToLLM = options.sendToLLM || null; // Must be provided

    this.iteration = 0;
    this.history = [];
    this.isRunning = false;
  }

  /**
   * Start the vision loop
   *
   * Errors thrown inside a step (capture, LLM call, action execution,
   * callbacks) are reported through `onError` with type 'exception' and the
   * loop moves on to the next iteration.
   *
   * @param {string} goal - The user's goal/task description
   * @returns {Promise<{success: boolean, iterations: number, reason?: string}>}
   *   `{ success: true, iterations }` once completion is detected; otherwise
   *   `{ success: false, reason, iterations }` with `reason` 'stopped' when
   *   `stop()` ended the loop or 'max_iterations' when the limit was hit.
   * @throws {Error} When no `sendToLLM` callback was provided, or when the
   *   initial `getOpenApps()` call rejects.
   */
  async run(goal) {
    if (!this.sendToLLM) {
      throw new Error('sendToLLM callback must be provided');
    }

    this.isRunning = true;
    this.iteration = 0;
    this.history = [];

    try {
      // Initial context gathering
      const apps = await getOpenApps();

      while (this.isRunning && this.iteration < this.maxIterations) {
        this.iteration++;

        try {
          // Step 1: Capture current state
          const screenshotPath = await captureScreenshot(`step_${this.iteration}.png`);

          this.onStep({
            iteration: this.iteration,
            phase: 'capture',
            screenshot: screenshotPath,
          });

          // Step 2: Build context for LLM
          const context = this.buildContext(goal, apps, screenshotPath);

          // Step 3: Ask LLM for next action
          const response = await this.sendToLLM(context);

          // Only truncated copies are kept so the history stays small.
          this.history.push({
            iteration: this.iteration,
            context: context.substring(0, 500) + '...',
            response: response.substring(0, 500) + '...',
          });

          // Step 4: Check if goal is complete
          if (isGoalComplete(response)) {
            this.onComplete({
              iterations: this.iteration,
              history: this.history,
            });
            this.isRunning = false;
            return { success: true, iterations: this.iteration };
          }

          // Step 5: Parse and execute actions
          const actions = parseActionsFromResponse(response);

          if (actions.length === 0) {
            // LLM didn't provide actions, might need clarification
            this.onError({
              type: 'no_actions',
              response: response,
              iteration: this.iteration,
            });
            continue;
          }

          for (const action of actions) {
            // Honour stop() between actions, not only between iterations.
            if (!this.isRunning) {
              break;
            }

            this.onAction({
              iteration: this.iteration,
              action: action,
            });

            const result = await executeAction(action.command, action.args);

            if (!result.success) {
              // A failed action is reported but does not abort the batch.
              this.onError({
                type: 'action_failed',
                action: action,
                error: result.error,
                iteration: this.iteration,
              });
            }

            // Wait for UI to update
            await new Promise((resolve) => setTimeout(resolve, CONFIG.screenshotDelay));
          }
        } catch (error) {
          this.onError({
            type: 'exception',
            error: error.message,
            iteration: this.iteration,
          });
        }
      }

      // The loop ends either because stop() cleared the flag or because the
      // iteration limit was reached; an explicit stop wins when both apply.
      const reason = this.isRunning ? 'max_iterations' : 'stopped';
      return { success: false, reason, iterations: this.iteration };
    } finally {
      // Never leave the instance marked as running once run() has settled,
      // including the max-iterations exit and a failing getOpenApps().
      this.isRunning = false;
    }
  }

  /**
   * Build context/prompt for LLM
   *
   * Only the screenshot's file path is embedded in the prompt, together with
   * the goal, the open-apps listing and the last three history entries
   * (each response cut to 200 characters).
   *
   * @param {string} goal - The user's goal/task description.
   * @param {string} apps - Open applications listing.
   * @param {string} screenshotPath - Path of the current screenshot.
   * @returns {string} Markdown-formatted prompt.
   */
  buildContext(goal, apps, screenshotPath) {
    const historyContext = this.history
      .slice(-3)
      .map((h) => `Step ${h.iteration}: ${h.response.substring(0, 200)}...`)
      .join('\n');

    return `# Vision Loop - Autonomous Computer Control
Credit: Inspired by AmberSahdev/Open-Interface

## Current Goal
${goal}

## Current State
- Iteration: ${this.iteration}/${this.maxIterations}
- Screenshot: ${screenshotPath}
- Open Applications:
${apps}

## Recent History
${historyContext || 'No previous actions'}

## Instructions
1. Analyze the current state based on the screenshot path and open apps
2. Determine the next action(s) to achieve the goal
3. Provide PowerShell commands using bin/input.ps1
4. If the goal is complete, say "Task completed"

## Available Commands
- powershell bin/input.ps1 key LWIN - Press Windows key
- powershell bin/input.ps1 uiclick "Element Name" - Click UI element
- powershell bin/input.ps1 type "text" - Type text
- powershell bin/input.ps1 click - Left click at current position
- powershell bin/input.ps1 mouse X Y - Move mouse
- powershell bin/input.ps1 apps - List open windows

## Response Format
Provide your analysis and commands in PowerShell code blocks:
\`\`\`powershell
powershell bin/input.ps1 [command] [args]
\`\`\`
`;
  }

  /**
   * Stop the loop
   *
   * Clears the running flag; `run()` checks it before each iteration and
   * before each action, so an action already in flight still finishes.
   */
  stop() {
    this.isRunning = false;
  }
}
|
|
|
|
/**
 * Simple one-shot action execution (no loop)
 *
 * Runs the given commands one after another through `executeAction`,
 * pausing 200 ms after each successful one, and stops at the first command
 * that reports failure (the failing entry is still part of the result).
 *
 * @param {Iterable<{command: string, args: string[]}>} commands - Commands to run in order.
 * @returns {Promise<Array<{command: object, result: object}>>} One entry per
 *   executed command, pairing the original command object with its result.
 */
export async function executeOneShot(commands) {
  const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));
  const log = [];

  for (const step of commands) {
    const outcome = await executeAction(step.command, step.args);
    log.push({ command: step, result: outcome });

    if (outcome.success) {
      await pause(200);
      continue;
    }

    // First failure ends the run; later commands are never started.
    break;
  }

  return log;
}
|
|
|
|
// Default export: every named export of this module gathered into a single
// object, for callers that prefer `import visionLoop from ...`.
const visionLoopApi = {
  VisionLoop,
  executeAction,
  captureScreenshot,
  getOpenApps,
  parseActionsFromResponse,
  isGoalComplete,
  executeOneShot,
};

export default visionLoopApi;
|