#!/usr/bin/env python3
"""Vosk STT — transcribe an audio file to text. Optimized for speed.

Usage: stt.py <audio_file>

Prints a single JSON object to stdout:
  {"text": ..., "confidence": ...}  on success (exit 0)
  {"text": "", "confidence": 0}     when nothing was recognized (exit 1)
  {"error": ...}                    on failure (exit 2)
"""
import io
import json
import os
import subprocess
import sys
import wave

# Silence Vosk/Kaldi logging before the library is imported.
os.environ['VOSK_LOG_LEVEL'] = '-1'

DEFAULT_MODEL_PATH = '/home/uroma2/vosk-model'


def _convert_to_wav(audio_file):
    """Convert *audio_file* to 16 kHz mono WAV bytes via ffmpeg.

    Returns the WAV bytes on success; raises RuntimeError on any
    conversion failure (missing binary, timeout, bad input).
    """
    try:
        # capture_output drains stdout and stderr concurrently, avoiding
        # the deadlock of reading one pipe while the other fills up.
        proc = subprocess.run(
            ['ffmpeg', '-y', '-i', audio_file,
             '-ar', '16000', '-ac', '1', '-f', 'wav', '-v', 'error', '-'],
            capture_output=True, timeout=15,
        )
    except (OSError, subprocess.TimeoutExpired) as e:
        raise RuntimeError(str(e)) from e
    # 44 bytes is the minimal RIFF/WAV header; anything shorter is junk.
    if proc.returncode != 0 or len(proc.stdout) < 44:
        raise RuntimeError("ffmpeg conversion failed")
    return proc.stdout


def _transcribe(wav_data, model_path):
    """Run Vosk over *wav_data* (16 kHz mono WAV bytes).

    Returns (text, confidence): the joined transcript and the mean
    per-word confidence rounded to 2 decimals (0.0 if no words).
    """
    import vosk  # deferred: only pay the import cost after conversion succeeds

    model = vosk.Model(model_path)
    rec = vosk.KaldiRecognizer(model, 16000)

    text_parts = []
    total_conf = 0.0
    conf_count = 0

    def _absorb(result):
        # Fold one recognizer result dict into the running transcript
        # and confidence accumulators (shared by partial + final results).
        nonlocal total_conf, conf_count
        t = result.get('text', '').strip()
        if t:
            text_parts.append(t)
        for w in result.get('result', []):
            total_conf += w.get('conf', 0)
            conf_count += 1

    # wave.open accepts a file-like object, so no temp file is needed;
    # the context manager guarantees the reader is closed.
    with wave.open(io.BytesIO(wav_data), 'rb') as wf:
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if rec.AcceptWaveform(data):
                _absorb(json.loads(rec.Result()))
    # Flush whatever the recognizer is still holding.
    _absorb(json.loads(rec.FinalResult()))

    text = ' '.join(text_parts).strip()
    confidence = round(total_conf / conf_count, 2) if conf_count else 0.0
    return text, confidence


def main():
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: stt.py <audio_file>"}))
        sys.exit(2)

    audio_file = sys.argv[1]
    # Allow overriding the model location; default is unchanged.
    model_path = os.environ.get('VOSK_MODEL_PATH', DEFAULT_MODEL_PATH)

    try:
        wav_data = _convert_to_wav(audio_file)
        text, confidence = _transcribe(wav_data, model_path)
    except Exception as e:
        # Top-level boundary: report any failure as JSON and exit 2.
        print(json.dumps({"error": str(e)}))
        sys.exit(2)

    if not text:
        print(json.dumps({"text": "", "confidence": 0}))
        sys.exit(1)
    print(json.dumps({"text": text, "confidence": confidence}))


if __name__ == '__main__':
    main()