Files
zCode-CLI-X/src/tools/VisionTool.js
admin e92e9f5b9d feat: add vision, TTS, and browser tools (17 tools total)
- VisionTool: image analysis via Z.AI GLM-4V multimodal API
- TTSTool: text-to-speech via node-edge-tts (free, auto-sends audio to chat)
- BrowserTool: web page content extraction via cheerio (strips HTML, extracts text)
- All 3 wired into tools/index.js + bot tool definitions + handlers
- TTS handler auto-sends generated audio as voice message to chat
2026-05-05 16:52:12 +00:00

80 lines
2.5 KiB
JavaScript

import { logger } from '../utils/logger.js';
import axios from 'axios';
import fs from 'fs-extra';
import path from 'path';
import { fileURLToPath } from 'url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
export class VisionTool {
  /**
   * Tool that analyzes an image via the Z.AI GLM-4V multimodal API.
   *
   * @param {object} [config] - Optional configuration.
   * @param {object} [config.apiClient] - Pre-built API client (stored but not
   *   used by execute(); kept for interface compatibility with other tools).
   * @param {string} [config.model] - Vision model name; defaults to 'glm-4v-flash'.
   */
  constructor(config = {}) {
    this.name = 'vision';
    this.description = 'Analyze an image from URL or file path. Returns a detailed description and answers specific questions about the image.';
    this.apiClient = config.apiClient || null;
    this.model = config.model || 'glm-4v-flash';
  }

  /**
   * Analyze an image and answer a question about it.
   *
   * Remote http(s) URLs are passed through to the API unchanged. Local file
   * paths are read and inlined as a base64 data URL — the remote API cannot
   * fetch files from this machine's filesystem, so sending a bare path (the
   * previous behavior) could never succeed.
   *
   * @param {object} args
   * @param {string} args.image_url - HTTP(S) URL or local file path of the image.
   * @param {string} [args.question] - Question about the image; defaults to a
   *   general "describe this image" prompt.
   * @returns {Promise<string>} Model answer, or a '❌ …' error string on failure
   *   (this method never throws).
   */
  async execute({ image_url, question }) {
    if (!image_url) return '❌ image_url is required.';
    const userQuestion = question || 'Describe this image in detail.';
    try {
      let imageUrl = image_url;
      if (!/^https?:\/\//i.test(image_url)) {
        // Local file: verify it exists, then inline it as a base64 data URL.
        const resolved = path.resolve(image_url);
        if (!(await fs.pathExists(resolved))) {
          return `❌ File not found: ${resolved}`;
        }
        const mimeByExt = {
          '.jpg': 'image/jpeg',
          '.jpeg': 'image/jpeg',
          '.png': 'image/png',
          '.gif': 'image/gif',
          '.webp': 'image/webp',
        };
        // Fall back to JPEG for unknown extensions rather than failing.
        const mime = mimeByExt[path.extname(resolved).toLowerCase()] ?? 'image/jpeg';
        const data = await fs.readFile(resolved);
        imageUrl = `data:${mime};base64,${data.toString('base64')}`;
      }

      // Call the Z.AI multimodal chat-completions endpoint (GLM-4V).
      // axios is imported at module scope; the former dynamic re-import
      // here was redundant and has been removed.
      const env = (await import('../config/env.js')).default;
      const apiKey = env.ZAI_API_KEY;
      const baseUrl = env.GLM_BASE_URL || 'https://api.z.ai/api/coding/paas/v4';
      const response = await axios.post(`${baseUrl}/chat/completions`, {
        model: this.model,
        messages: [
          {
            role: 'user',
            content: [
              { type: 'image_url', image_url: { url: imageUrl } },
              { type: 'text', text: userQuestion },
            ],
          },
        ],
        max_tokens: 1024,
      }, {
        headers: {
          'Authorization': `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
        },
        timeout: 30000,
      });

      const result = response.data?.choices?.[0]?.message?.content;
      if (!result) return '❌ No response from vision model.';
      return result;
    } catch (error) {
      logger.error(`Vision error: ${error.message}`);
      if (error.response) {
        // API responded with an error status: surface a truncated payload.
        return `❌ Vision API error ${error.response.status}: ${JSON.stringify(error.response.data?.error || error.response.data)?.substring(0, 200)}`;
      }
      return `❌ Vision error: ${error.message}`;
    }
  }
}