feat: add vision, TTS, and browser tools (17 tools total)

- VisionTool: image analysis via Z.AI GLM-4V multimodal API
- TTSTool: text-to-speech via node-edge-tts (free, auto-sends audio to chat)
- BrowserTool: web page content extraction via cheerio (strips HTML, extracts text)
- All 3 wired into tools/index.js + bot tool definitions + handlers
- TTS handler auto-sends generated audio as voice message to chat
This commit is contained in:
admin
2026-05-05 16:52:12 +00:00
Unverified
parent d7f1e3db90
commit e92e9f5b9d
7 changed files with 793 additions and 0 deletions

View File

@@ -311,6 +311,28 @@ export async function initBot(config, api, tools, skills, agents) {
command: { type: 'string', description: 'Command to run' },
}, required: ['action'] },
},
vision: {
description: 'Analyze an image from URL or file path. Returns detailed description and answers questions about the image.',
parameters: { type: 'object', properties: {
image_url: { type: 'string', description: 'Image URL (http/https) or local file path to analyze' },
question: { type: 'string', description: 'Specific question about the image (optional, defaults to full description)' },
}, required: ['image_url'] },
},
tts: {
description: 'Convert text to speech audio. Generates an MP3 file using Edge TTS (free, no API key needed).',
parameters: { type: 'object', properties: {
text: { type: 'string', description: 'Text to convert to speech (max 5000 chars)' },
voice: { type: 'string', description: 'Voice name (default: en-US-AvaNeural)' },
output_path: { type: 'string', description: 'Output file path (optional)' },
}, required: ['text'] },
},
browser: {
description: 'Fetch and extract readable content from a web page URL. Returns title, description, and main text content.',
parameters: { type: 'object', properties: {
url: { type: 'string', description: 'URL to fetch and extract content from' },
selector: { type: 'string', description: 'CSS selector for content extraction (optional, auto-detects article/main)' },
}, required: ['url'] },
},
delegate_agent: {
description: 'Delegate to a specialized agent role',
parameters: { type: 'object', properties: {
@@ -560,6 +582,39 @@ export async function initBot(config, api, tools, skills, agents) {
if (!tool) return '❌ Cron tool unavailable.';
try { return await tool.execute(args); } catch (e) { return `${e.message}`; }
},
vision: async (args) => {
const tool = svc.toolMap.get('vision');
if (!tool) return '❌ Vision tool unavailable.';
try { return await tool.execute(args); } catch (e) { return `${e.message}`; }
},
tts: async (args) => {
const tool = svc.toolMap.get('tts');
if (!tool) return '❌ TTS tool unavailable.';
try {
const result = await tool.execute(args);
// If audio was generated, send it as a voice message
if (result.startsWith('✅')) {
const filePath = result.match(/saved:\s*(.+)/)?.[1]?.trim();
if (filePath) {
try {
await svc.bot.api.sendAudio(svc.currentChatId, { source: filePath }, {
caption: '🔊 TTS',
performer: 'zCode',
});
return '✅ Audio sent as voice message.';
} catch (sendErr) {
return `${result}\n⚠ Could not auto-send audio: ${sendErr.message}`;
}
}
}
return result;
} catch (e) { return `${e.message}`; }
},
browser: async (args) => {
const tool = svc.toolMap.get('browser');
if (!tool) return '❌ Browser tool unavailable.';
try { return await tool.execute(args); } catch (e) { return `${e.message}`; }
},
delegate_agent: async (args) => {
const agent = svc.agents.find(a => a.id === args.agent_id);
if (!agent) return `❌ Agent not found: ${args.agent_id}`;
@@ -883,6 +938,7 @@ export async function initBot(config, api, tools, skills, agents) {
// ── Load conversation history for this chat ──
// The conversation key distinguishes forum threads via message_thread_id.
const chatKey = conversation._key(ctx.chat.id, ctx.message?.message_thread_id);
// Remember the active chat so the TTS handler can auto-send generated audio.
// NOTE(review): only the chat id is tracked, not message_thread_id — in a
// forum topic the audio may land outside the thread; confirm this is intended.
svc.currentChatId = ctx.chat.id; // Track for TTS auto-send
const history = await conversation.getContext(chatKey, text);
// Create stream consumer for real-time edit-in-place