feat: add Vosk STT - offline voice-to-text, no API key needed

2026-05-05 17:50:50 +00:00
parent 6685f60855
commit 3b6a7ec502
3 changed files with 216 additions and 9 deletions
--- a/scripts/stt.py
+++ b/scripts/stt.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""
+Vosk STT — Transcribe OGG/voice to text.
+Usage: python3 stt.py <input_file> [language]
+  input_file: path to audio file (ogg, wav, mp3, etc.)
+  language: 'en' (default) or 'ge' (Georgian model); any other value falls back to the 'en' model
+Output: JSON to stdout: {"text": "...", "confidence": 0.95} on success, {"error": "..."} on failure
+Exit codes: 0=success, 1=no speech, 2=error
+"""
+import sys, os, json, subprocess, tempfile, wave
+
+def main():
+    """Transcribe the audio file named on the command line and print JSON.
+
+    Reads ``sys.argv[1]`` (audio path) and the optional ``sys.argv[2]``
+    (language key, ``'en'`` when omitted), converts the audio to a
+    temporary 16 kHz mono WAV with ffmpeg, feeds it to a Vosk recognizer
+    and prints a single JSON object to stdout:
+
+    * ``{"text": ..., "confidence": ...}`` when speech was recognised;
+      confidence is the mean per-word ``conf`` rounded to two decimals.
+    * ``{"text": "", "confidence": 0}`` when nothing was recognised.
+    * ``{"error": ...}`` on any failure.
+
+    Exits through ``sys.exit`` with status 1 (no speech) or 2 (error);
+    returns normally on success. The temporary WAV file is always removed.
+    """
+    if len(sys.argv) < 2:
+        print(json.dumps({"error": "Usage: stt.py <audio_file> [en|ge]"}))
+        sys.exit(2)
+
+    audio_file = sys.argv[1]
+    lang = sys.argv[2] if len(sys.argv) > 2 else 'en'
+
+    # Suppress vosk logging (kept in case the runtime honours the variable;
+    # vosk.SetLogLevel below is the explicit API switch).
+    os.environ['VOSK_LOG_LEVEL'] = '-1'
+
+    # Unknown language keys deliberately fall back to the English model.
+    model_path = {
+        'en': '/home/uroma2/vosk-model',
+        'ge': '/home/uroma2/vosk-model-ge',
+    }.get(lang, '/home/uroma2/vosk-model')
+
+    if not os.path.isdir(model_path):
+        print(json.dumps({"error": f"Model not found: {model_path}"}))
+        sys.exit(2)
+
+    # Reserve a temp path for ffmpeg to write into; delete=False because the
+    # file is reopened by name after this handle is closed.
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+        wav_path = tmp.name
+
+    try:
+        # Convert to 16kHz mono WAV. The banner is hidden and the log level
+        # lowered so that stderr holds the actual error, not version info.
+        result = subprocess.run(
+            ['ffmpeg', '-hide_banner', '-loglevel', 'error', '-y',
+             '-i', audio_file,
+             '-ar', '16000', '-ac', '1', '-f', 'wav', wav_path],
+            capture_output=True, timeout=30
+        )
+        if result.returncode != 0:
+            # errors='replace': undecodable bytes must not mask the failure.
+            detail = result.stderr.decode(errors='replace')[:200]
+            print(json.dumps({"error": f"ffmpeg failed: {detail}"}))
+            sys.exit(2)
+
+        # Imported lazily so usage/model/ffmpeg errors do not need vosk.
+        import vosk
+        vosk.SetLogLevel(-1)
+        model = vosk.Model(model_path)
+        rec = vosk.KaldiRecognizer(model, 16000)
+        # Without word-level output the results carry no 'conf' values and
+        # the reported confidence would always be 0.0.
+        rec.SetWords(True)
+
+        results = []
+        # Context manager closes the WAV handle on every path, including the
+        # sys.exit below, before the file is unlinked in `finally`.
+        with wave.open(wav_path, 'rb') as wf:
+            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
+                print(json.dumps({"error": "Audio format mismatch after conversion"}))
+                sys.exit(2)
+
+            while True:
+                data = wf.readframes(4000)
+                if not data:
+                    break
+                # True means the recognizer finished an utterance segment.
+                if rec.AcceptWaveform(data):
+                    results.append(json.loads(rec.Result()))
+
+        # Flush whatever audio is still buffered in the recognizer.
+        results.append(json.loads(rec.FinalResult()))
+
+        # Collect the non-empty segment texts in order.
+        text_parts = [t for t in (r.get('text', '').strip() for r in results) if t]
+        # Per-word confidences across all segments.
+        confs = [
+            word['conf']
+            for r in results
+            for word in r.get('result', [])
+            if 'conf' in word
+        ]
+
+        text = ' '.join(text_parts).strip()
+        confidence = round(sum(confs) / len(confs), 2) if confs else 0.0
+
+        if not text:
+            print(json.dumps({"text": "", "confidence": 0}))
+            sys.exit(1)
+
+        print(json.dumps({"text": text, "confidence": confidence}))
+
+    except Exception as e:
+        # Top-level boundary: report any failure as JSON. SystemExit is not
+        # an Exception subclass, so the sys.exit calls above pass through.
+        print(json.dumps({"error": str(e)}))
+        sys.exit(2)
+    finally:
+        if os.path.exists(wav_path):
+            os.unlink(wav_path)
+
+# Run the transcription only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()