#!/usr/bin/env python3
"""
Vosk STT — Transcribe OGG/voice to text.

Usage: python3 stt.py <input_file> [language]
  input_file: path to audio file (ogg, wav, mp3, etc.)
  language: 'en' (default) or 'ge' — Georgian model

Output: JSON to stdout: {"text": "...", "confidence": 0.95}
        On failure: {"error": "..."}
Exit codes: 0=success, 1=no speech, 2=error
"""

import json
import os
import subprocess
import sys
import tempfile
import wave

# Sample rate (Hz) that ffmpeg converts to and the recognizer is created with.
_SAMPLE_RATE = 16000

# Number of frames handed to the recognizer per read.
_FRAMES_PER_CHUNK = 4000

# Language code -> model directory. Unknown codes fall back to the default.
_DEFAULT_MODEL_PATH = '/home/uroma2/vosk-model'
_MODEL_PATHS = {
    'en': _DEFAULT_MODEL_PATH,
    'ge': '/home/uroma2/vosk-model-ge',
}


def _fail(message):
    """Print ``{"error": message}`` as JSON to stdout and exit with code 2."""
    print(json.dumps({"error": message}))
    sys.exit(2)


def _convert_to_wav(audio_file, wav_path):
    """Convert *audio_file* to a 16 kHz mono WAV at *wav_path* using ffmpeg.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran longer than 30 seconds.
        OSError: ffmpeg could not be started (e.g. it is not installed).
    """
    completed = subprocess.run(
        [
            'ffmpeg',
            # Keep stderr limited to real errors so the truncated message
            # reported below is the failure reason, not the version banner.
            '-hide_banner', '-loglevel', 'error',
            # Never let ffmpeg consume the caller's stdin.
            '-nostdin',
            '-y', '-i', audio_file,
            '-ar', str(_SAMPLE_RATE), '-ac', '1', '-f', 'wav', wav_path,
        ],
        capture_output=True,
        timeout=30,
    )
    if completed.returncode != 0:
        # errors='replace': stderr may contain bytes that are not valid UTF-8
        # (e.g. echoed file names); reporting must not raise on them.
        detail = completed.stderr.decode(errors='replace')[:200]
        raise RuntimeError(f"ffmpeg failed: {detail}")


def _transcribe(wav_path, model_path):
    """Run the Vosk recognizer over *wav_path* and return its result dicts.

    The returned list holds every intermediate result followed by the final
    result, each parsed from the recognizer's JSON output.

    Raises:
        RuntimeError: the WAV is not 16-bit mono at the expected sample rate.
    """
    # Imported lazily so usage/model errors are reported without loading vosk
    # and so an import failure is reported as a JSON error by the caller.
    import vosk

    # Documented way to silence Kaldi/Vosk logging; must precede Model().
    vosk.SetLogLevel(-1)

    model = vosk.Model(model_path)
    rec = vosk.KaldiRecognizer(model, _SAMPLE_RATE)
    # Ask for the per-word 'result' list; the confidence calculation depends
    # on the 'conf' values it carries.
    rec.SetWords(True)

    results = []
    # Context manager guarantees the file is closed on every path.
    with wave.open(wav_path, 'rb') as wf:
        if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                or wf.getframerate() != _SAMPLE_RATE):
            raise RuntimeError("Audio format mismatch after conversion")

        # readframes() returns b'' once the file is exhausted.
        for data in iter(lambda: wf.readframes(_FRAMES_PER_CHUNK), b''):
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result()))

    results.append(json.loads(rec.FinalResult()))
    return results


def _summarize(results):
    """Return ``(text, confidence)`` for a list of recognizer result dicts.

    *text* is the non-empty 'text' fields joined with single spaces.
    *confidence* is the mean of every word-level 'conf' value rounded to two
    decimals, or 0.0 when no word carries one.
    """
    text_parts = []
    confidences = []
    for entry in results:
        segment = entry.get('text', '').strip()
        if segment:
            text_parts.append(segment)
        confidences.extend(
            word['conf'] for word in entry.get('result', []) if 'conf' in word
        )

    text = ' '.join(text_parts).strip()
    if confidences:
        confidence = round(sum(confidences) / len(confidences), 2)
    else:
        confidence = 0.0
    return text, confidence


def main():
    """Transcribe the audio file named on the command line.

    Reads ``sys.argv[1]`` (input file) and optional ``sys.argv[2]`` (language
    code), prints a single JSON object to stdout and exits with 0 on success,
    1 when no speech was recognized, or 2 on any error.
    """
    if len(sys.argv) < 2:
        _fail("Usage: stt.py <input_file> [en|ge]")

    audio_file = sys.argv[1]
    lang = sys.argv[2] if len(sys.argv) > 2 else 'en'

    # Kept from the original script; the effective log suppression is the
    # vosk.SetLogLevel() call made before the model is loaded.
    os.environ['VOSK_LOG_LEVEL'] = '-1'

    model_path = _MODEL_PATHS.get(lang, _DEFAULT_MODEL_PATH)
    if not os.path.isdir(model_path):
        _fail(f"Model not found: {model_path}")

    # Only the name is needed: ffmpeg writes the file itself, so the handle
    # is closed immediately and the file is removed in the finally below.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name

    try:
        _convert_to_wav(audio_file, wav_path)
        results = _transcribe(wav_path, model_path)
        text, confidence = _summarize(results)

        if not text:
            print(json.dumps({"text": "", "confidence": 0}))
            sys.exit(1)

        print(json.dumps({"text": text, "confidence": confidence}))
    except Exception as e:
        # Top-level boundary: every failure becomes a JSON error + exit 2.
        # SystemExit is not an Exception, so the exits above pass through.
        _fail(str(e))
    finally:
        if os.path.exists(wav_path):
            os.unlink(wav_path)


if __name__ == '__main__':
    main()