- VisionTool: image analysis via Z.AI GLM-4V multimodal API
- TTSTool: text-to-speech via node-edge-tts (free, auto-sends audio to chat)
- BrowserTool: web page content extraction via cheerio (strips HTML, extracts text)
- All 3 wired into tools/index.js + bot tool definitions + handlers (see the wiring sketch below)
- TTS handler auto-sends generated audio as voice message to chat
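A minimal sketch of what the tools/index.js wiring could look like; the factory function, export names, and tool-definition shape below are assumptions for illustration, not the repository's actual code.

```js
// tools/index.js (hypothetical wiring sketch; names and shapes are assumptions)
import { VisionTool } from './vision.js';
import { TTSTool } from './tts.js';
import { BrowserTool } from './browser.js';

// Instantiate each tool once and index them by name so the bot's tool-call
// handler can look up the right tool when the model requests one.
export function createTools(config = {}) {
  const tools = [new VisionTool(config), new TTSTool(config), new BrowserTool(config)];
  return new Map(tools.map((tool) => [tool.name, tool]));
}

// Function-calling definitions handed to the model (OpenAI-style shape assumed).
export function toolDefinitions(tools) {
  return [...tools.values()].map((tool) => ({
    type: 'function',
    function: {
      name: tool.name,
      description: tool.description,
      parameters: { type: 'object', properties: {}, additionalProperties: true },
    },
  }));
}
```

The TTS handler mentioned above would then take the tts tool's generated audio file and forward it to the chat as a voice message; that part is bot-framework specific and omitted here.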
import { logger } from '../utils/logger.js';
import axios from 'axios';
import fs from 'fs-extra';
import path from 'path';

export class VisionTool {
  constructor(config = {}) {
    this.name = 'vision';
    this.description = 'Analyze an image from URL or file path. Returns a detailed description and answers specific questions about the image.';
    this.apiClient = config.apiClient || null;
    this.model = config.model || 'glm-4v-flash';
  }

  async execute({ image_url, question }) {
    if (!image_url) return '❌ image_url is required.';

    const userQuestion = question || 'Describe this image in detail.';

    try {
      // If it's a local file path, check that it exists
      let imageUrl = image_url;
      if (!image_url.startsWith('http')) {
        const resolved = path.resolve(image_url);
        if (!(await fs.pathExists(resolved))) {
          return `❌ File not found: ${resolved}`;
        }
        // The API expects a reachable URL; local paths are passed through as-is,
        // so true local-file support would need base64 encoding.
        imageUrl = resolved;
      }

      // Call the Z.AI multimodal API (GLM-4V)
      const env = (await import('../config/env.js')).default;
      const apiKey = env.ZAI_API_KEY;
      const baseUrl = env.GLM_BASE_URL || 'https://api.z.ai/api/coding/paas/v4';

      const response = await axios.post(`${baseUrl}/chat/completions`, {
        model: this.model,
        messages: [
          {
            role: 'user',
            content: [
              {
                type: 'image_url',
                image_url: { url: imageUrl },
              },
              {
                type: 'text',
                text: userQuestion,
              },
            ],
          },
        ],
        max_tokens: 1024,
      }, {
        headers: {
          'Authorization': `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
        },
        timeout: 30000,
      });

      const result = response.data?.choices?.[0]?.message?.content;
      if (!result) return '❌ No response from vision model.';

      return result;
    } catch (error) {
      logger.error(`Vision error: ${error.message}`);
      if (error.response) {
        return `❌ Vision API error ${error.response.status}: ${JSON.stringify(error.response.data?.error || error.response.data)?.substring(0, 200)}`;
      }
      return `❌ Vision error: ${error.message}`;
    }
  }
}
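A quick usage sketch for VisionTool, assuming ZAI_API_KEY (and optionally GLM_BASE_URL) are exposed via ../config/env.js as the class expects; the import path and image URL are placeholders.

```js
// Hypothetical usage from another ESM module in the same project.
import { VisionTool } from './tools/vision.js'; // path is an assumption

const vision = new VisionTool({ model: 'glm-4v-flash' });

// Default prompt: "Describe this image in detail."
console.log(await vision.execute({ image_url: 'https://example.com/photo.jpg' }));

// Ask a targeted question about the same image.
console.log(await vision.execute({
  image_url: 'https://example.com/photo.jpg',
  question: 'What text appears on the sign?',
}));
```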