// OpenQode/lib/vision-loop.mjs

/**
 * Vision Loop - Automatic Visual Feedback Automation
 * Implements the "screenshot → LLM → action → repeat" pattern
 *
 * Credit: Inspired by AmberSahdev/Open-Interface (https://github.com/AmberSahdev/Open-Interface)
 * License: MIT
 *
 * This module provides:
 * 1. Screenshot capture
 * 2. Vision model analysis
 * 3. Action extraction and execution
 * 4. Course correction (retry on failure)
 * 5. Goal completion detection
 */

import { spawn, execSync } from 'child_process';
import fs from 'fs';
import path from 'path';

// Tunable defaults shared by the helpers in this module.
const CONFIG = {
    // Upper bound on loop steps before a run gives up.
    maxIterations: 20,
    // Pause (ms) after an action so the UI can update before the next screenshot.
    screenshotDelay: 500,
    // Time limit (ms) for a single action.
    actionTimeout: 10000,
    // Directory that receives screenshots (resolved with path.resolve by captureScreenshot).
    screenshotDir: 'screenshots',
    // Script handed to `powershell -File`; the child runs with cwd = process.cwd().
    inputScript: 'bin/input.ps1'
};

/**
 * Execute one input.ps1 command in a PowerShell child process.
 *
 * @param {string} command - input.ps1 sub-command (e.g. 'screenshot', 'apps').
 * @param {string[]} [args=[]] - Extra arguments appended after the command.
 * @returns {Promise<{success: boolean, output: string, error?: string}>}
 *   Resolves when the process closes. `success` is true for exit code 0;
 *   otherwise `error` holds the trimmed stderr text. `output` is trimmed stdout.
 * @throws {Error} Rejects when the process cannot be spawned, or with
 *   'Action timeout' when it is still running after CONFIG.actionTimeout ms.
 */
export async function executeAction(command, args = []) {
    return new Promise((resolve, reject) => {
        // NOTE(review): with `shell: true` the command and arguments are handed to
        // the shell as one command line, so arguments containing spaces or shell
        // metacharacters are re-interpreted by that shell. Callers that forward
        // LLM-produced text should treat it as untrusted input.
        const proc = spawn('powershell', ['-File', CONFIG.inputScript, command, ...args], {
            cwd: process.cwd(),
            shell: true
        });

        let stdout = '';
        let stderr = '';
        let timer = null;
        let settled = false;

        // Settle the promise exactly once and release the timeout timer so it
        // neither keeps the event loop alive nor fires after the process is gone.
        const settle = (callback, value) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timer);
            callback(value);
        };

        proc.stdout.on('data', (data) => {
            stdout += data.toString();
        });

        proc.stderr.on('data', (data) => {
            stderr += data.toString();
        });

        proc.on('close', (code) => {
            if (code === 0) {
                settle(resolve, { success: true, output: stdout.trim() });
            } else {
                settle(resolve, { success: false, output: stdout.trim(), error: stderr.trim() });
            }
        });

        proc.on('error', (err) => {
            settle(reject, err);
        });

        // Timeout: kill the child and reject if it has not finished in time.
        timer = setTimeout(() => {
            if (settled) {
                return;
            }
            proc.kill();
            settle(reject, new Error('Action timeout'));
        }, CONFIG.actionTimeout);
    });
}

/**
 * Capture a screenshot through the input.ps1 `screenshot` command.
 *
 * @param {string|null} [filename=null] - File name inside CONFIG.screenshotDir;
 *   when falsy, `screenshot_<timestamp>.png` is used.
 * @returns {Promise<string>} Absolute path of the written screenshot.
 * @throws {Error} 'Failed to capture screenshot: <reason>' when the command
 *   fails or the expected file does not exist afterwards.
 */
export async function captureScreenshot(filename = null) {
    const dir = path.resolve(CONFIG.screenshotDir);
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(dir, { recursive: true });

    const file = filename || `screenshot_${Date.now()}.png`;
    const fullPath = path.join(dir, file);

    const result = await executeAction('screenshot', [fullPath]);

    if (result.success && fs.existsSync(fullPath)) {
        return fullPath;
    }

    // A successful exit carries no `error` field, so describe that case
    // explicitly instead of reporting "undefined".
    const reason = result.success
        ? `command succeeded but ${fullPath} was not created`
        : (result.error || result.output || 'unknown error');
    throw new Error(`Failed to capture screenshot: ${reason}`);
}

/**
 * List the open applications/windows by running the `apps` action.
 *
 * @returns {Promise<string>} The `output` field of the action result; it is
 *   returned regardless of whether the action reported success.
 */
export async function getOpenApps() {
    const { output } = await executeAction('apps');
    return output;
}

/**
 * Parse an LLM response for actions.
 *
 * Scans every fenced code block (with or without a `powershell` tag) and turns
 * each line that invokes bin/input.ps1 into an action object. Blank lines and
 * comment lines (first non-blank character is '#') are skipped.
 *
 * Arguments are split on whitespace; quotes are not interpreted, so
 * `uiclick "Element Name"` yields the two tokens `"Element` and `Name"`.
 *
 * @param {string} response - Raw LLM response text.
 * @returns {Array<{type: string, command: string, args: string[]}>}
 *   Actions in the order they appear; `type` is always 'input'.
 */
export function parseActionsFromResponse(response) {
    // Match PowerShell code blocks
    const codeBlockRegex = /```(?:powershell)?\s*([\s\S]*?)```/gi;
    // Match an input.ps1 invocation: sub-command in group 1, raw argument text in group 2
    const inputCommandRegex = /(?:powershell\s+)?(?:\.\\)?bin[\/\\]input\.ps1\s+(\w+)\s*(.*)/i;
    const actions = [];

    let block;
    while ((block = codeBlockRegex.exec(response)) !== null) {
        // Trim before filtering so indented comments are recognised as comments
        // instead of being parsed as commands.
        const lines = block[1]
            .split('\n')
            .map((line) => line.trim())
            .filter((line) => line !== '' && !line.startsWith('#'));

        for (const line of lines) {
            const inputMatch = line.match(inputCommandRegex);
            if (!inputMatch) {
                continue;
            }

            const rawArgs = inputMatch[2].trim();
            actions.push({
                type: 'input',
                command: inputMatch[1],
                args: rawArgs ? rawArgs.split(/\s+/) : []
            });
        }
    }

    return actions;
}

/**
 * Check if the goal is complete based on the LLM response.
 *
 * A response counts as complete when it contains one of the completion phrases
 * as whole words (case-insensitive). Whole-word matching keeps words such as
 * "abandoned" or "unfinished" from being read as "done" / "finished".
 *
 * @param {string} response - Raw LLM response text.
 * @returns {boolean} True when a completion phrase is present; false otherwise,
 *   including when `response` is not a string.
 */
export function isGoalComplete(response) {
    if (typeof response !== 'string') {
        return false;
    }

    const completionIndicators = [
        'task completed',
        'goal achieved',
        'successfully completed',
        'done',
        'finished',
        'completed successfully',
        'mission accomplished'
    ];

    // The phrases contain no regex metacharacters, so they can be joined directly.
    const completionPattern = new RegExp(`\\b(?:${completionIndicators.join('|')})\\b`, 'i');
    return completionPattern.test(response);
}

/**
 * Vision Loop State Machine
 *
 * Repeats "capture screenshot → ask LLM → execute parsed actions" until the
 * LLM response is detected as complete, stop() is called, or the iteration
 * cap is reached.
 */
export class VisionLoop {
    /**
     * @param {object} [options]
     * @param {number} [options.maxIterations] - Step cap; falsy values fall back to CONFIG.maxIterations.
     * @param {Function} [options.onStep] - Called with { iteration, phase, screenshot } after each capture.
     * @param {Function} [options.onAction] - Called with { iteration, action } before each action is executed.
     * @param {Function} [options.onComplete] - Called with { iterations, history } when completion is detected.
     * @param {Function} [options.onError] - Called with { type, iteration, ... } where type is
     *   'no_actions', 'action_failed' or 'exception'.
     * @param {Function} [options.sendToLLM] - Receives the context string and returns (a promise of)
     *   the response text. Required by run().
     */
    constructor(options = {}) {
        this.maxIterations = options.maxIterations || CONFIG.maxIterations;
        this.onStep = options.onStep || (() => { });
        this.onAction = options.onAction || (() => { });
        this.onComplete = options.onComplete || (() => { });
        this.onError = options.onError || (() => { });
        this.sendToLLM = options.sendToLLM || null; // Must be provided

        this.iteration = 0;
        this.history = [];
        this.isRunning = false;
    }

    /**
     * Start the vision loop
     * @param {string} goal - The user's goal/task description
     * @returns {Promise<{success: boolean, iterations: number, reason?: string}>}
     *   `success: true` when completion was detected; otherwise `reason` is
     *   'max_iterations' or 'stopped'.
     * @throws {Error} When no sendToLLM callback was provided. Errors raised
     *   inside an iteration are reported through onError instead of thrown.
     */
    async run(goal) {
        if (!this.sendToLLM) {
            throw new Error('sendToLLM callback must be provided');
        }

        this.isRunning = true;
        this.iteration = 0;
        this.history = [];

        try {
            // Initial context gathering (the app list is fetched once and reused for every step)
            const apps = await getOpenApps();

            while (this.isRunning && this.iteration < this.maxIterations) {
                this.iteration++;

                try {
                    // Step 1: Capture current state
                    const screenshotPath = await captureScreenshot(`step_${this.iteration}.png`);

                    this.onStep({
                        iteration: this.iteration,
                        phase: 'capture',
                        screenshot: screenshotPath
                    });

                    // Step 2: Build context for LLM
                    const context = this.buildContext(goal, apps, screenshotPath);

                    // Step 3: Ask LLM for next action
                    const response = await this.sendToLLM(context);

                    this.history.push({
                        iteration: this.iteration,
                        context: context.substring(0, 500) + '...',
                        response: response.substring(0, 500) + '...'
                    });

                    // Step 4: Check if goal is complete
                    if (isGoalComplete(response)) {
                        this.onComplete({
                            iterations: this.iteration,
                            history: this.history
                        });
                        return { success: true, iterations: this.iteration };
                    }

                    // Step 5: Parse and execute actions
                    const actions = parseActionsFromResponse(response);

                    if (actions.length === 0) {
                        // LLM didn't provide actions, might need clarification
                        this.onError({
                            type: 'no_actions',
                            response: response,
                            iteration: this.iteration
                        });
                        continue;
                    }

                    for (const action of actions) {
                        // Honor stop() requests that arrive while a batch is executing.
                        if (!this.isRunning) {
                            break;
                        }

                        this.onAction({
                            iteration: this.iteration,
                            action: action
                        });

                        const result = await executeAction(action.command, action.args);

                        // A failed action is reported but does not abort the batch;
                        // the next iteration's screenshot lets the LLM course-correct.
                        if (!result.success) {
                            this.onError({
                                type: 'action_failed',
                                action: action,
                                error: result.error,
                                iteration: this.iteration
                            });
                        }

                        // Wait for UI to update
                        await new Promise(resolve => setTimeout(resolve, CONFIG.screenshotDelay));
                    }

                } catch (error) {
                    this.onError({
                        type: 'exception',
                        error: error.message,
                        iteration: this.iteration
                    });
                }
            }

            if (this.iteration >= this.maxIterations) {
                return { success: false, reason: 'max_iterations', iterations: this.iteration };
            }

            return { success: false, reason: 'stopped', iterations: this.iteration };
        } finally {
            // Leave the instance idle whichever path ended the run (completion,
            // iteration cap, stop(), or a rejection during context gathering).
            this.isRunning = false;
        }
    }

    /**
     * Build context/prompt for LLM
     * @param {string} goal - The user's goal/task description
     * @param {string} apps - Open-applications text inserted verbatim into the prompt
     * @param {string} screenshotPath - Path of the screenshot for the current step
     * @returns {string} Prompt containing the goal, current state, the last
     *   three history entries, instructions and the available commands.
     */
    buildContext(goal, apps, screenshotPath) {
        const historyContext = this.history.slice(-3).map(h =>
            `Step ${h.iteration}: ${h.response.substring(0, 200)}...`
        ).join('\n');

        return `# Vision Loop - Autonomous Computer Control
Credit: Inspired by AmberSahdev/Open-Interface

## Current Goal
${goal}

## Current State
- Iteration: ${this.iteration}/${this.maxIterations}
- Screenshot: ${screenshotPath}
- Open Applications:
${apps}

## Recent History
${historyContext || 'No previous actions'}

## Instructions
1. Analyze the current state based on the screenshot path and open apps
2. Determine the next action(s) to achieve the goal
3. Provide PowerShell commands using bin/input.ps1
4. If the goal is complete, say "Task completed"

## Available Commands
- powershell bin/input.ps1 key LWIN - Press Windows key
- powershell bin/input.ps1 uiclick "Element Name" - Click UI element
- powershell bin/input.ps1 type "text" - Type text
- powershell bin/input.ps1 click - Left click at current position
- powershell bin/input.ps1 mouse X Y - Move mouse
- powershell bin/input.ps1 apps - List open windows

## Response Format
Provide your analysis and commands in PowerShell code blocks:
\`\`\`powershell
powershell bin/input.ps1 [command] [args]
\`\`\`
`;
    }

    /**
     * Stop the loop. Takes effect before the next action or iteration starts;
     * an action that is already executing is allowed to finish.
     */
    stop() {
        this.isRunning = false;
    }
}

/**
 * Run a fixed list of commands once, in order, without the vision loop.
 *
 * Execution stops after the first command whose result is not successful;
 * that failing entry is still included in the returned list. A 200 ms pause
 * follows every successful command.
 *
 * @param {Iterable<{command: string, args: string[]}>} commands - Commands to run.
 * @returns {Promise<Array<{command: object, result: object}>>} One entry per
 *   executed command, pairing the original command object with its result.
 */
export async function executeOneShot(commands) {
    const outcomes = [];

    for (const entry of commands) {
        const outcome = await executeAction(entry.command, entry.args);
        outcomes.push({ command: entry, result: outcome });

        if (outcome.success) {
            // Short pause before moving on to the next command.
            await new Promise((wake) => {
                setTimeout(wake, 200);
            });
            continue;
        }

        break;
    }

    return outcomes;
}

// Default export: the same public API as the named exports, bundled in one object.
const visionLoopModule = {
    VisionLoop,
    executeAction,
    captureScreenshot,
    getOpenApps,
    parseActionsFromResponse,
    isGoalComplete,
    executeOneShot
};

export default visionLoopModule;