feat: Integrated Vision & Robust Translation Layer, Secured Repo (removed keys)

2025-12-15 04:53:51 +04:00
parent a8436c91a3
commit 2407c42eb9
38 changed files with 7786 additions and 3776 deletions
--- a/lib/vision-loop.mjs
+++ b/lib/vision-loop.mjs
@@ -0,0 +1,352 @@
+/**
+ * Vision Loop - Automatic Visual Feedback Automation
+ * Implements the "screenshot → LLM → action → repeat" pattern
+ * 
+ * Credit: Inspired by AmberSahdev/Open-Interface (https://github.com/AmberSahdev/Open-Interface)
+ * License: MIT
+ * 
+ * This module provides:
+ * 1. Screenshot capture
+ * 2. Vision model analysis
+ * 3. Action extraction and execution
+ * 4. Course correction (retry on failure)
+ * 5. Goal completion detection
+ */
+
+import { spawn, execSync } from 'child_process';
+import fs from 'fs';
+import path from 'path';
+
+// Configuration: module-wide tunables read by the helpers below.
+const CONFIG = {
+    maxIterations: 20,         // Maximum steps before giving up
+    screenshotDelay: 500,      // ms to wait after action before screenshot
+    actionTimeout: 10000,      // ms timeout for each action
+    screenshotDir: 'screenshots',  // passed to path.resolve(), i.e. relative to the current working directory
+    inputScript: 'bin/input.ps1'   // script given to `powershell -File`; the process is spawned with cwd = process.cwd()
+};
+
+/**
+ * Regroup double-quoted arguments the way a command shell would.
+ *
+ * Callers may hand over a quoted value either as one element (`"Start Menu"`)
+ * or split on whitespace (`"Start`, `Menu"`). In both cases the surrounding
+ * quotes are removed and the text is emitted as a single argument. An opening
+ * quote that is never closed leaves its fragments untouched.
+ *
+ * @param {Iterable<*>} args - Raw arguments; every element is stringified.
+ * @returns {string[]} Arguments ready to be passed to `spawn` without a shell.
+ */
+function normalizeActionArgs(args) {
+    const normalized = [];
+    let pending = null; // fragments of a quoted argument that is still open
+
+    for (const raw of args) {
+        const arg = String(raw);
+
+        if (pending !== null) {
+            pending.push(arg);
+            if (arg.endsWith('"')) {
+                normalized.push(pending.join(' ').slice(1, -1));
+                pending = null;
+            }
+        } else if (arg.length > 1 && arg.startsWith('"') && arg.endsWith('"')) {
+            normalized.push(arg.slice(1, -1));
+        } else if (arg.startsWith('"')) {
+            pending = [arg];
+        } else {
+            normalized.push(arg);
+        }
+    }
+
+    if (pending !== null) {
+        normalized.push(...pending);
+    }
+
+    return normalized;
+}
+
+/**
+ * Execute a PowerShell command via input.ps1
+ *
+ * PowerShell is started directly rather than through a command shell, so
+ * characters such as `&`, `|` or `;` inside an argument reach the script as
+ * plain text and an argument containing spaces (for example a file path)
+ * stays a single argument.
+ *
+ * @param {string} command - Sub-command understood by the input script.
+ * @param {Array<*>} [args=[]] - Arguments for the sub-command. Double-quoted
+ *   values are unquoted and regrouped, see `normalizeActionArgs`.
+ * @returns {Promise<{success: boolean, output: string, error?: string}>}
+ *   Resolves when the process closes. `success` is true for exit code 0;
+ *   otherwise `error` holds the trimmed stderr. `output` is the trimmed stdout.
+ * @throws {Error} Rejects when the process cannot be started, or with
+ *   'Action timeout' when it is still running after `CONFIG.actionTimeout` ms.
+ */
+export async function executeAction(command, args = []) {
+    return new Promise((resolve, reject) => {
+        const scriptArgs = [CONFIG.inputScript, String(command), ...normalizeActionArgs(args)];
+        const proc = spawn('powershell', ['-File', ...scriptArgs], {
+            cwd: process.cwd()
+        });
+
+        let stdout = '';
+        let stderr = '';
+
+        // Kill the process if it outlives the timeout. The timer is cleared as
+        // soon as the process settles so it neither keeps the event loop alive
+        // nor fires against a process that has already exited.
+        const timer = setTimeout(() => {
+            proc.kill();
+            reject(new Error('Action timeout'));
+        }, CONFIG.actionTimeout);
+
+        proc.stdout.on('data', (data) => {
+            stdout += data.toString();
+        });
+
+        proc.stderr.on('data', (data) => {
+            stderr += data.toString();
+        });
+
+        proc.on('close', (code) => {
+            clearTimeout(timer);
+            if (code === 0) {
+                resolve({ success: true, output: stdout.trim() });
+            } else {
+                resolve({ success: false, output: stdout.trim(), error: stderr.trim() });
+            }
+        });
+
+        proc.on('error', (err) => {
+            clearTimeout(timer);
+            reject(err);
+        });
+    });
+}
+
+/**
+ * Capture screenshot and return path
+ *
+ * The image is written below `CONFIG.screenshotDir` (created on demand) by
+ * the input script's `screenshot` command.
+ *
+ * @param {string|null} [filename=null] - File name inside the screenshot
+ *   directory. A timestamped `screenshot_<ms>.png` name is used when omitted.
+ * @returns {Promise<string>} Absolute path of the image that was written.
+ * @throws {Error} When the script reports failure or no file exists at the
+ *   target path afterwards. Rejections from `executeAction` propagate as-is.
+ */
+export async function captureScreenshot(filename = null) {
+    const dir = path.resolve(CONFIG.screenshotDir);
+    // With `recursive: true` this is a no-op when the directory already exists.
+    fs.mkdirSync(dir, { recursive: true });
+
+    const file = filename || `screenshot_${Date.now()}.png`;
+    const fullPath = path.join(dir, file);
+
+    // Remove a leftover image with the same name (e.g. step_1.png from an
+    // earlier run) so the existence check below cannot be satisfied by it.
+    fs.rmSync(fullPath, { force: true });
+
+    const result = await executeAction('screenshot', [fullPath]);
+
+    if (!result.success) {
+        throw new Error('Failed to capture screenshot: ' + (result.error || 'script exited with an error'));
+    }
+
+    if (!fs.existsSync(fullPath)) {
+        throw new Error('Failed to capture screenshot: no file was written to ' + fullPath);
+    }
+
+    return fullPath;
+}
+
+/**
+ * Query the input script for the currently open applications/windows.
+ *
+ * @returns {Promise<string>} The `output` field of the `apps` action result.
+ */
+export async function getOpenApps() {
+    const { output } = await executeAction('apps');
+    return output;
+}
+
+/**
+ * Parse LLM response for actions
+ * Extracts input.ps1 invocations from fenced code blocks.
+ *
+ * Inside each fenced block (optionally tagged `powershell`) every line is
+ * trimmed; blank lines and `#` comment lines are skipped, including indented
+ * comments. Lines that invoke `bin/input.ps1` become one action each.
+ *
+ * Arguments are split on whitespace, except that a double-quoted section is
+ * kept inside one token together with its quotes, so `type "hello   world"`
+ * yields the single argument `"hello   world"` with its spacing intact.
+ * A quote that is never closed falls back to plain whitespace splitting.
+ *
+ * @param {string} response - Raw LLM reply. `null`/`undefined` yield no actions.
+ * @returns {Array<{type: 'input', command: string, args: string[]}>}
+ *   Actions in the order they appear in the reply.
+ */
+export function parseActionsFromResponse(response) {
+    const actions = [];
+    const text = String(response == null ? '' : response);
+
+    // Match PowerShell code blocks
+    const codeBlockRegex = /```(?:powershell)?\s*([\s\S]*?)```/gi;
+    // Extract input.ps1 commands: [1] = sub-command, [2] = rest of the line
+    const commandRegex = /(?:powershell\s+)?(?:\.\\)?bin[\/\\]input\.ps1\s+(\w+)\s*(.*)/i;
+    // One token = a run of non-space characters and/or "quoted sections";
+    // the second alternative picks up a token that starts with an unclosed quote.
+    const tokenRegex = /(?:[^\s"]|"[^"]*")+|\S+/g;
+
+    let match;
+    while ((match = codeBlockRegex.exec(text)) !== null) {
+        for (const rawLine of match[1].split(/\r?\n/)) {
+            const line = rawLine.trim();
+            if (!line || line.startsWith('#')) {
+                continue;
+            }
+
+            const inputMatch = line.match(commandRegex);
+            if (!inputMatch) {
+                continue;
+            }
+
+            actions.push({
+                type: 'input',
+                command: inputMatch[1],
+                args: inputMatch[2].match(tokenRegex) || []
+            });
+        }
+    }
+
+    return actions;
+}
+
+// Phrases that signal completion. They are matched case-insensitively and only
+// as whole words, so "done" does not fire inside "abandoned" or "undone".
+const COMPLETION_PATTERN = new RegExp(
+    '\\b(?:' + [
+        'task completed',
+        'goal achieved',
+        'successfully completed',
+        'done',
+        'finished',
+        'completed successfully',
+        'mission accomplished'
+    ].join('|') + ')\\b',
+    'i'
+);
+
+/**
+ * Check if goal is complete based on LLM response
+ *
+ * This is a keyword heuristic: a negated phrase such as "not done yet" still
+ * counts as completion.
+ *
+ * @param {string} response - Raw LLM reply.
+ * @returns {boolean} True when the reply contains a completion phrase as
+ *   whole words; false otherwise, including for non-string input.
+ */
+export function isGoalComplete(response) {
+    return typeof response === 'string' && COMPLETION_PATTERN.test(response);
+}
+
+/**
+ * Vision Loop State Machine
+ *
+ * Repeats "capture screenshot -> ask LLM -> execute returned actions" until
+ * the LLM reply signals completion, `stop()` is called, or the iteration
+ * limit is reached.
+ */
+export class VisionLoop {
+    /**
+     * @param {object} [options]
+     * @param {number} [options.maxIterations] - Iteration limit; falls back to
+     *   `CONFIG.maxIterations` when omitted or falsy.
+     * @param {Function} [options.onStep] - Called after each screenshot with
+     *   `{ iteration, phase: 'capture', screenshot }`.
+     * @param {Function} [options.onAction] - Called before each action with
+     *   `{ iteration, action }`.
+     * @param {Function} [options.onComplete] - Called once with
+     *   `{ iterations, history }` when the LLM reply signals completion.
+     * @param {Function} [options.onError] - Called with `{ type, iteration, ... }`
+     *   where `type` is 'no_actions', 'action_failed' or 'exception'.
+     * @param {Function} options.sendToLLM - Receives the prompt built by
+     *   `buildContext`; its awaited result is used as the reply text.
+     *   Required by `run()`.
+     */
+    constructor(options = {}) {
+        this.maxIterations = options.maxIterations || CONFIG.maxIterations;
+        this.onStep = options.onStep || (() => { });
+        this.onAction = options.onAction || (() => { });
+        this.onComplete = options.onComplete || (() => { });
+        this.onError = options.onError || (() => { });
+        this.sendToLLM = options.sendToLLM || null; // Must be provided
+
+        this.iteration = 0;
+        this.history = [];
+        this.isRunning = false;
+    }
+
+    /**
+     * Start the vision loop
+     *
+     * Errors raised inside an iteration are reported through `onError` with
+     * type 'exception' and the loop moves on to the next iteration.
+     * `isRunning` is false again whenever this method settles, whether it
+     * returns or throws.
+     *
+     * @param {string} goal - The user's goal/task description
+     * @returns {Promise<{success: boolean, iterations: number, reason?: string}>}
+     *   `success: true` when the LLM signalled completion. Otherwise `reason`
+     *   is 'stopped' (after `stop()`) or 'max_iterations'.
+     * @throws {Error} When no `sendToLLM` callback was provided, or when the
+     *   initial `getOpenApps()` call fails.
+     */
+    async run(goal) {
+        if (!this.sendToLLM) {
+            throw new Error('sendToLLM callback must be provided');
+        }
+
+        this.isRunning = true;
+        this.iteration = 0;
+        this.history = [];
+
+        try {
+            // Initial context gathering (done once, reused for every iteration)
+            const apps = await getOpenApps();
+
+            while (this.isRunning && this.iteration < this.maxIterations) {
+                this.iteration++;
+
+                try {
+                    // Step 1: Capture current state
+                    const screenshotPath = await captureScreenshot(`step_${this.iteration}.png`);
+
+                    this.onStep({
+                        iteration: this.iteration,
+                        phase: 'capture',
+                        screenshot: screenshotPath
+                    });
+
+                    // Step 2: Build context for LLM
+                    const context = this.buildContext(goal, apps, screenshotPath);
+
+                    // Step 3: Ask LLM for next action
+                    const response = await this.sendToLLM(context);
+
+                    // Only the first 500 characters of each side are kept
+                    this.history.push({
+                        iteration: this.iteration,
+                        context: context.substring(0, 500) + '...',
+                        response: response.substring(0, 500) + '...'
+                    });
+
+                    // Step 4: Check if goal is complete
+                    if (isGoalComplete(response)) {
+                        this.onComplete({
+                            iterations: this.iteration,
+                            history: this.history
+                        });
+                        return { success: true, iterations: this.iteration };
+                    }
+
+                    // Step 5: Parse and execute actions
+                    const actions = parseActionsFromResponse(response);
+
+                    if (actions.length === 0) {
+                        // LLM didn't provide actions, might need clarification
+                        this.onError({
+                            type: 'no_actions',
+                            response: response,
+                            iteration: this.iteration
+                        });
+                        continue;
+                    }
+
+                    for (const action of actions) {
+                        // stop() may have been called while waiting for the
+                        // LLM or for a previous action; skip the rest.
+                        if (!this.isRunning) {
+                            break;
+                        }
+
+                        this.onAction({
+                            iteration: this.iteration,
+                            action: action
+                        });
+
+                        const result = await executeAction(action.command, action.args);
+
+                        if (!result.success) {
+                            this.onError({
+                                type: 'action_failed',
+                                action: action,
+                                error: result.error,
+                                iteration: this.iteration
+                            });
+                        }
+
+                        // Wait for UI to update
+                        await new Promise(resolve => setTimeout(resolve, CONFIG.screenshotDelay));
+                    }
+
+                } catch (error) {
+                    this.onError({
+                        type: 'exception',
+                        error: error.message,
+                        iteration: this.iteration
+                    });
+                }
+            }
+
+            // The loop ends either because stop() cleared the flag or because
+            // the limit was hit; an explicit stop takes precedence.
+            const reason = this.isRunning ? 'max_iterations' : 'stopped';
+            return { success: false, reason: reason, iterations: this.iteration };
+        } finally {
+            // Never leave the instance marked as running once run() is over.
+            this.isRunning = false;
+        }
+    }
+
+    /**
+     * Build context/prompt for LLM
+     *
+     * @param {string} goal - The user's goal/task description.
+     * @param {string} apps - Open-applications listing inserted verbatim.
+     * @param {string} screenshotPath - Path of the latest screenshot; only the
+     *   path is placed in the prompt.
+     * @returns {string} Prompt containing the goal, current state, the last
+     *   three history entries (200 characters each) and the command reference.
+     */
+    buildContext(goal, apps, screenshotPath) {
+        const historyContext = this.history.slice(-3).map(h =>
+            `Step ${h.iteration}: ${h.response.substring(0, 200)}...`
+        ).join('\n');
+
+        return `# Vision Loop - Autonomous Computer Control
+Credit: Inspired by AmberSahdev/Open-Interface
+
+## Current Goal
+${goal}
+
+## Current State
+- Iteration: ${this.iteration}/${this.maxIterations}
+- Screenshot: ${screenshotPath}
+- Open Applications:
+${apps}
+
+## Recent History
+${historyContext || 'No previous actions'}
+
+## Instructions
+1. Analyze the current state based on the screenshot path and open apps
+2. Determine the next action(s) to achieve the goal
+3. Provide PowerShell commands using bin/input.ps1
+4. If the goal is complete, say "Task completed"
+
+## Available Commands
+- powershell bin/input.ps1 key LWIN - Press Windows key
+- powershell bin/input.ps1 uiclick "Element Name" - Click UI element
+- powershell bin/input.ps1 type "text" - Type text
+- powershell bin/input.ps1 click - Left click at current position
+- powershell bin/input.ps1 mouse X Y - Move mouse
+- powershell bin/input.ps1 apps - List open windows
+
+## Response Format
+Provide your analysis and commands in PowerShell code blocks:
+\`\`\`powershell
+powershell bin/input.ps1 [command] [args]
+\`\`\`
+`;
+    }
+
+    /**
+     * Stop the loop
+     *
+     * Clears the running flag. `run()` checks it before every iteration and
+     * before every action, so work already awaited is allowed to finish.
+     */
+    stop() {
+        this.isRunning = false;
+    }
+}
+
+/**
+ * Run a fixed list of commands once, in order, without the vision loop.
+ *
+ * Execution ends at the first command whose result is not successful; that
+ * failing entry is still part of the returned list. After every successful
+ * command there is a 200 ms pause.
+ *
+ * @param {Array<{command: string, args?: Array}>} commands - Commands to run.
+ * @returns {Promise<Array<{command: object, result: object}>>} One entry per
+ *   executed command, pairing the command with its `executeAction` result.
+ */
+export async function executeOneShot(commands) {
+    const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));
+    const results = [];
+
+    for (const cmd of commands) {
+        const result = await executeAction(cmd.command, cmd.args);
+        results.push({ command: cmd, result });
+
+        if (!result.success) {
+            return results;
+        }
+
+        await pause(200);
+    }
+
+    return results;
+}
+
+// Default export: the same functions and class that are exported by name
+// above, bundled into one object.
+export default {
+    VisionLoop,
+    executeAction,
+    captureScreenshot,
+    getOpenApps,
+    parseActionsFromResponse,
+    isGoalComplete,
+    executeOneShot
+};