fix: voice STT path fix + execSync error handling + fetch download

2026-05-05 17:56:22 +00:00
parent 3b6a7ec502
commit 3bfd842998
2 changed files with 74 additions and 73 deletions
--- a/scripts/stt.py
+++ b/scripts/stt.py
@@ -1,83 +1,70 @@
 #!/usr/bin/env python3
-"""
-Vosk STT — Transcribe OGG/voice to text.
-Usage: python3 stt.py <input_file> [language]
-  input_file: path to audio file (ogg, wav, mp3, etc.)
-  language: 'en' (default) or 'ge' — Georgian model
-Output: JSON to stdout: {"text": "...", "confidence": 0.95}
-Exit codes: 0=success, 1=no speech, 2=error
-"""
+"""Vosk STT — transcribe audio file to text. Optimized for speed."""
 import sys, os, json, subprocess, tempfile, wave

+os.environ['VOSK_LOG_LEVEL'] = '-1'
+
 def main():
    if len(sys.argv) < 2:
-        print(json.dumps({"error": "Usage: stt.py <audio_file> [en|ge]"}))
+        print(json.dumps({"error": "Usage: stt.py <audio_file>"}))
        sys.exit(2)

    audio_file = sys.argv[1]
-    lang = sys.argv[2] if len(sys.argv) > 2 else 'en'
+    model_path = '/home/uroma2/vosk-model'

-    # Suppress vosk logging
-    os.environ['VOSK_LOG_LEVEL'] = '-1'
-
-    model_path = {
-        'en': '/home/uroma2/vosk-model',
-        'ge': '/home/uroma2/vosk-model-ge',
-    }.get(lang, '/home/uroma2/vosk-model')
-
-    if not os.path.isdir(model_path):
-        print(json.dumps({"error": f"Model not found: {model_path}"}))
+    # Convert to 16kHz mono WAV via ffmpeg, reading the converted audio from its stdout pipe
+    try:
+        proc = subprocess.Popen(
+            ['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1',
+             '-f', 'wav', '-v', 'error', '-'],
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        wav_data = proc.stdout.read()
+        proc.wait(timeout=15)
+        if proc.returncode != 0 or len(wav_data) < 44:
+            print(json.dumps({"error": "ffmpeg conversion failed"}))
+            sys.exit(2)
+    except Exception as e:
+        print(json.dumps({"error": str(e)}))
        sys.exit(2)

-    # Convert to 16kHz mono WAV using ffmpeg
-    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
-        wav_path = tmp.name
-
+    # Write wav_data to a temp file and open it by path (wave.open would also accept a file object)
+    tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    try:
-        result = subprocess.run(
-            ['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1', '-f', 'wav', wav_path],
-            capture_output=True, timeout=30
-        )
-        if result.returncode != 0:
-            print(json.dumps({"error": f"ffmpeg failed: {result.stderr.decode()[:200]}"}))
-            sys.exit(2)
+        tmp.write(wav_data)
+        tmp.close()

        import vosk
        model = vosk.Model(model_path)
        rec = vosk.KaldiRecognizer(model, 16000)

-        wf = wave.open(wav_path, 'rb')
-        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
-            print(json.dumps({"error": "Audio format mismatch after conversion"}))
-            sys.exit(2)
-
-        results = []
-        while True:
-            data = wf.readframes(4000)
-            if len(data) == 0:
-                break
-            if rec.AcceptWaveform(data):
-                results.append(json.loads(rec.Result()))
-
-        # Final result
-        final = json.loads(rec.FinalResult())
-        results.append(final)
-
-        # Extract text
+        wf = wave.open(tmp.name, 'rb')
        text_parts = []
        total_conf = 0
        conf_count = 0
-        for r in results:
-            t = r.get('text', '').strip()
-            if t:
-                text_parts.append(t)
-            # Confidence from final result
-            if 'result' in r:
-                for word in r.get('result', []):
-                    if 'conf' in word:
-                        total_conf += word['conf']
+
+        while True:
+            data = wf.readframes(4000)
+            if not data:
+                break
+            if rec.AcceptWaveform(data):
+                r = json.loads(rec.Result())
+                t = r.get('text', '').strip()
+                if t:
+                    text_parts.append(t)
+                    for w in r.get('result', []):
+                        total_conf += w.get('conf', 0)
                        conf_count += 1

+        # Final partial
+        r = json.loads(rec.FinalResult())
+        t = r.get('text', '').strip()
+        if t:
+            text_parts.append(t)
+            for w in r.get('result', []):
+                total_conf += w.get('conf', 0)
+                conf_count += 1
+
        text = ' '.join(text_parts).strip()
        confidence = round(total_conf / conf_count, 2) if conf_count > 0 else 0.0

@@ -86,13 +73,12 @@ def main():
            sys.exit(1)

        print(json.dumps({"text": text, "confidence": confidence}))
-
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(2)
    finally:
-        if os.path.exists(wav_path):
-            os.unlink(wav_path)
+        try: os.unlink(tmp.name)
+        except: pass

 if __name__ == '__main__':
    main()
--- a/src/bot/index.js
+++ b/src/bot/index.js
@@ -1088,21 +1088,36 @@ export async function initBot(config, api, tools, skills, agents) {
      const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`;
      const oggPath = `/tmp/zcode-voice-${Date.now()}.ogg`;

-      // Download voice file
+      // Download voice file via fetch (faster than curl subprocess)
      const { execSync } = await import('child_process');
-      execSync(`curl -sL "${url}" -o "${oggPath}"`, { timeout: 15000 });
+      const voiceResp = await fetch(url);
+      if (!voiceResp.ok) throw new Error(`Download failed: ${voiceResp.status}`);
+      const { writeFileSync, unlinkSync } = await import('fs');
+      writeFileSync(oggPath, Buffer.from(await voiceResp.arrayBuffer()));
      logger.info(`Voice downloaded: ${oggPath}`);

-      // Run Vosk STT via Python script
-      const sttScript = new URL('../scripts/stt.py', import.meta.url).pathname;
-      const result = execSync(
-        `python3 "${sttScript}" "${oggPath}" 2>/dev/null`,
-        { timeout: 30000, encoding: 'utf-8' }
-      );
-      const parsed = JSON.parse(result.trim());
-
-      // Cleanup
-      execSync(`rm -f "${oggPath}"`);
+      // Run Vosk STT — path is ../../scripts/stt.py from src/bot/
+      const sttScript = new URL('../../scripts/stt.py', import.meta.url).pathname;
+      let parsed;
+      try {
+        const result = execSync(
+          `python3 "${sttScript}" "${oggPath}"`,
+          { timeout: 30000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }
+        );
+        parsed = JSON.parse(result.trim());
+      } catch (e) {
+        // Non-zero exit (1 = no speech, 2 = error): stt.py still prints JSON on stdout
+        const stdout = e.stdout?.trim();
+        if (stdout) {
+          try { parsed = JSON.parse(stdout); } catch { parsed = null; }
+        }
+        if (!parsed || !parsed.text) {
+          await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id,
+            '🎤 Could not detect speech in the voice message.');
+          return;
+        }
+      }
+      unlinkSync(oggPath);

      if (parsed.error) {
        logger.error(`STT error: ${parsed.error}`);