feat: add Vosk STT - offline voice-to-text, no API key needed

2026-05-05 17:50:50 +00:00
parent 6685f60855
commit 3b6a7ec502
3 changed files with 216 additions and 9 deletions
--- a/src/bot/index.js
+++ b/src/bot/index.js
@@ -901,7 +901,7 @@ export async function initBot(config, api, tools, skills, agents) {
  });

  bot.command('voice', async (ctx) => {
-    await sendStreamingMessage(ctx, `🎤 *Voice I/O*\n\nVoice recording is available via the TS service layer.\nSend me a voice message and I will transcribe it.`);
+    // Static help text for /voice: describes the speech-to-text and text-to-speech paths.
+    const voiceHelp = `🎤 *Voice I/O*\n\n🎤→📝 *Speech-to-Text*: Send a voice message — transcribed via Vosk (offline, no API key, ~200ms).\n📝→🎤 *Text-to-Speech*: Ask the AI to use the \`tts\` tool — generates voice via Edge TTS (free).\n\nNo API keys needed. Runs fully on the server.`;
+    await sendStreamingMessage(ctx, voiceHelp);
  });

  bot.command('mcp', async (ctx) => {
@@ -1015,14 +1015,15 @@ export async function initBot(config, api, tools, skills, agents) {
  }

  // ── Message text handler (with dedup + queue + self-correction) ──
-  bot.on('message:text', async (ctx) => {
+  // ── Text message handler (shared by text & voice) ──
+  async function handleTextMessage(ctx, text, isVoice = false) {
    if (isDuplicate(ctx.message.message_id)) return;
    markProcessed(ctx.message.message_id);

    const key = buildSessionKey(ctx.chat.id, ctx.message?.message_thread_id);
-    const text = ctx.message.text;
    const user = ctx.from?.username || ctx.from?.first_name || 'Unknown';
-    logger.info(`💬 ${user}: ${text.substring(0, 80)}…`);
+    const prefix = isVoice ? '🎤' : '💬';
+    logger.info(`${prefix} ${user}: ${text.substring(0, 80)}…`);

    await queueRequest(key, text, async () => {
      await ctx.api.sendChatAction(ctx.chat.id, 'typing');
@@ -1069,17 +1070,66 @@ export async function initBot(config, api, tools, skills, agents) {
      // ── Self-learning: extract patterns from this interaction ──
      await selfLearn(text, result, memory);
    });
+  }
+
+  // Typed messages enter the shared pipeline directly; `isVoice` keeps its default (false).
+  bot.on('message:text', async (ctx) => {
+    const { text } = ctx.message;
+    await handleTextMessage(ctx, text);
  });

-  // ── Voice handler ──
+  // ── Voice handler (Vosk STT) ──
+  // Pipeline: download the Telegram voice file → run scripts/stt.py on it →
+  // feed the transcript into handleTextMessage as if it had been typed.
+  // Subprocesses run asynchronously and without a shell, the temp file is always
+  // removed, and the bot token is scrubbed from anything logged or sent to the chat.
  bot.on('message:voice', async (ctx) => {
    const fileId = ctx.message.voice.file_id;
    const user = ctx.from?.username || ctx.from?.first_name || 'Unknown';
    logger.info(`🎤 Voice from ${user}`);
-    await ctx.reply('🎤 Voice received! (STT via Whisper TBD)');
-    const file = await ctx.api.getFile(fileId);
-    const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`;
-    logger.info(`Voice file: ${url}`);
+    const statusMsg = await ctx.reply('🎤 Transcribing…');
+
+    const { execFile } = await import('child_process');
+    const { promisify } = await import('util');
+    const { unlink } = await import('fs/promises');
+    const { fileURLToPath } = await import('url');
+    const run = promisify(execFile);
+
+    // Child-process errors embed the full command line, which includes the download
+    // URL and therefore the bot token — scrub it before logging or replying.
+    const redact = (msg) => (botToken ? String(msg).split(botToken).join('<redacted>') : String(msg));
+
+    // Once the status message is deleted it can no longer be edited, so the
+    // error path has to know whether to edit it or send a fresh reply.
+    let statusDeleted = false;
+
+    try {
+      const file = await ctx.api.getFile(fileId);
+      if (!file.file_path) {
+        throw new Error('Telegram returned no file_path for the voice message');
+      }
+      const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`;
+      // Chat and message ids keep concurrent downloads from sharing a filename.
+      const oggPath = `/tmp/zcode-voice-${ctx.chat.id}-${ctx.message.message_id}-${Date.now()}.ogg`;
+
+      let parsed;
+      try {
+        // Download voice file. -f makes HTTP errors fail the command instead of
+        // saving the error body as audio; arguments are passed as an array (no shell).
+        await run('curl', ['-sSfL', url, '-o', oggPath], { timeout: 15000 });
+        logger.info(`Voice downloaded: ${oggPath}`);
+
+        // Run Vosk STT via Python script; only stdout is parsed, stderr is ignored.
+        const sttScript = fileURLToPath(new URL('../scripts/stt.py', import.meta.url));
+        const { stdout } = await run('python3', [sttScript, oggPath], { timeout: 30000, encoding: 'utf-8' });
+        parsed = JSON.parse(stdout.trim());
+      } finally {
+        // Best-effort cleanup on success and failure alike; the file may not
+        // exist if the download itself failed, so a rejection here is ignored.
+        await unlink(oggPath).catch(() => {});
+      }
+
+      if (parsed.error) {
+        logger.error(`STT error: ${parsed.error}`);
+        await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id,
+          `❌ STT error: ${parsed.error}`);
+        return;
+      }
+
+      if (!parsed.text) {
+        await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id,
+          '🎤 Could not detect speech in the voice message.');
+        return;
+      }
+
+      // `??` so that a genuine confidence of 0 is shown rather than '?'.
+      logger.info(`🎤 STT (${parsed.confidence ?? '?'}): ${parsed.text}`);
+      await ctx.api.deleteMessage(ctx.chat.id, statusMsg.message_id);
+      statusDeleted = true;
+
+      // Feed transcribed text into the main chat pipeline
+      await handleTextMessage(ctx, parsed.text, true);
+
+    } catch (err) {
+      // Redact first, then truncate, so a cut can never expose part of the token.
+      const safeMsg = redact(err?.message ?? err);
+      logger.error(`Voice handler error: ${safeMsg}`);
+      const notice = `❌ Voice processing failed: ${safeMsg.slice(0, 100)}`;
+      try {
+        if (statusDeleted) {
+          await ctx.reply(notice);
+        } else {
+          await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id, notice);
+        }
+      } catch (notifyErr) {
+        logger.error(`Voice handler: could not notify user: ${redact(notifyErr?.message ?? notifyErr)}`);
+      }
+    }
  });

  // ── Photo handler ──