- VisionTool: image analysis via Z.AI GLM-4V multimodal API
- TTSTool: text-to-speech via node-edge-tts (free, auto-sends audio to chat)
- BrowserTool: web page content extraction via cheerio (strips HTML, extracts text)
- All 3 wired into tools/index.js + bot tool definitions + handlers (see the wiring sketch below)
- TTS handler auto-sends generated audio as voice message to chat
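A minimal sketch of what the tools/index.js wiring could look like; the factory function, export names, and tool-definition shape below are assumptions for illustration, not the repository's actual code.

```js
// tools/index.js (hypothetical wiring sketch; names and shapes are assumptions)
import { VisionTool } from './vision.js';
import { TTSTool } from './tts.js';
import { BrowserTool } from './browser.js';

// Instantiate each tool once and index them by name so the bot's tool-call
// handler can look up the right tool when the model requests one.
export function createTools(config = {}) {
  const tools = [new VisionTool(config), new TTSTool(config), new BrowserTool(config)];
  return new Map(tools.map((tool) => [tool.name, tool]));
}

// Function-calling definitions handed to the model (OpenAI-style shape assumed).
export function toolDefinitions(tools) {
  return [...tools.values()].map((tool) => ({
    type: 'function',
    function: {
      name: tool.name,
      description: tool.description,
      parameters: { type: 'object', properties: {}, additionalProperties: true },
    },
  }));
}
```

The TTS handler mentioned above would then take the tts tool's generated audio file and forward it to the chat as a voice message; that part is bot-framework specific and omitted here.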
import { logger } from '../utils/logger.js';
import axios from 'axios';
import fs from 'fs-extra';
import path from 'path';

export class VisionTool {
  constructor(config = {}) {
    this.name = 'vision';
    this.description = 'Analyze an image from URL or file path. Returns a detailed description and answers specific questions about the image.';
    this.apiClient = config.apiClient || null;
    this.model = config.model || 'glm-4v-flash';
  }

  async execute({ image_url, question }) {
    if (!image_url) return '❌ image_url is required.';

    const userQuestion = question || 'Describe this image in detail.';

    try {
      // If it's a local file path, check that it exists
      let imageUrl = image_url;
      if (!image_url.startsWith('http')) {
        const resolved = path.resolve(image_url);
        if (!(await fs.pathExists(resolved))) {
          return `❌ File not found: ${resolved}`;
        }
        // The API expects a reachable URL; local paths are passed through as-is,
        // so true local-file support would need base64 encoding.
        imageUrl = resolved;
      }

      // Call the Z.AI multimodal API (GLM-4V)
      const env = (await import('../config/env.js')).default;
      const apiKey = env.ZAI_API_KEY;
      const baseUrl = env.GLM_BASE_URL || 'https://api.z.ai/api/coding/paas/v4';

      const response = await axios.post(`${baseUrl}/chat/completions`, {
        model: this.model,
        messages: [
          {
            role: 'user',
            content: [
              {
                type: 'image_url',
                image_url: { url: imageUrl },
              },
              {
                type: 'text',
                text: userQuestion,
              },
            ],
          },
        ],
        max_tokens: 1024,
      }, {
        headers: {
          'Authorization': `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
        },
        timeout: 30000,
      });

      const result = response.data?.choices?.[0]?.message?.content;
      if (!result) return '❌ No response from vision model.';

      return result;
    } catch (error) {
      logger.error(`Vision error: ${error.message}`);
      if (error.response) {
        return `❌ Vision API error ${error.response.status}: ${JSON.stringify(error.response.data?.error || error.response.data)?.substring(0, 200)}`;
      }
      return `❌ Vision error: ${error.message}`;
    }
  }
}
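A quick usage sketch for VisionTool, assuming ZAI_API_KEY (and optionally GLM_BASE_URL) are exposed via ../config/env.js as the class expects; the import path and image URL are placeholders.

```js
// Hypothetical usage from another ESM module in the same project.
import { VisionTool } from './tools/vision.js'; // path is an assumption

const vision = new VisionTool({ model: 'glm-4v-flash' });

// Default prompt: "Describe this image in detail."
console.log(await vision.execute({ image_url: 'https://example.com/photo.jpg' }));

// Ask a targeted question about the same image.
console.log(await vision.execute({
  image_url: 'https://example.com/photo.jpg',
  question: 'What text appears on the sign?',
}));
```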