diff --git a/scripts/stt.py b/scripts/stt.py index 2c51e378..23a24e51 100644 --- a/scripts/stt.py +++ b/scripts/stt.py @@ -1,83 +1,70 @@ #!/usr/bin/env python3 -""" -Vosk STT — Transcribe OGG/voice to text. -Usage: python3 stt.py <input_file> [language] - input_file: path to audio file (ogg, wav, mp3, etc.) - language: 'en' (default) or 'ge' — Georgian model -Output: JSON to stdout: {"text": "...", "confidence": 0.95} -Exit codes: 0=success, 1=no speech, 2=error -""" +"""Vosk STT — transcribe audio file to text. Optimized for speed.""" import sys, os, json, subprocess, tempfile, wave +os.environ['VOSK_LOG_LEVEL'] = '-1' + def main(): if len(sys.argv) < 2: - print(json.dumps({"error": "Usage: stt.py <input_file> [en|ge]"})) + print(json.dumps({"error": "Usage: stt.py <audio_file>"})) sys.exit(2) audio_file = sys.argv[1] - lang = sys.argv[2] if len(sys.argv) > 2 else 'en' + model_path = '/home/uroma2/vosk-model' - # Suppress vosk logging - os.environ['VOSK_LOG_LEVEL'] = '-1' - - model_path = { - 'en': '/home/uroma2/vosk-model', - 'ge': '/home/uroma2/vosk-model-ge', - }.get(lang, '/home/uroma2/vosk-model') - - if not os.path.isdir(model_path): - print(json.dumps({"error": f"Model not found: {model_path}"})) + # Convert to 16kHz mono WAV via ffmpeg — fast pipe, no temp file overhead + try: + proc = subprocess.Popen( + ['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1', + '-f', 'wav', '-v', 'error', '-'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + wav_data = proc.stdout.read() + proc.wait(timeout=15) + if proc.returncode != 0 or len(wav_data) < 44: + print(json.dumps({"error": "ffmpeg conversion failed"})) + sys.exit(2) + except Exception as e: + print(json.dumps({"error": str(e)})) sys.exit(2) - # Convert to 16kHz mono WAV using ffmpeg - with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp: - wav_path = tmp.name - + # Write wav_data to temp file for wave module (it needs a file path) + tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False) try: - result = 
subprocess.run( - ['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1', '-f', 'wav', wav_path], - capture_output=True, timeout=30 - ) - if result.returncode != 0: - print(json.dumps({"error": f"ffmpeg failed: {result.stderr.decode()[:200]}"})) - sys.exit(2) + tmp.write(wav_data) + tmp.close() import vosk model = vosk.Model(model_path) rec = vosk.KaldiRecognizer(model, 16000) - wf = wave.open(wav_path, 'rb') - if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000: - print(json.dumps({"error": "Audio format mismatch after conversion"})) - sys.exit(2) - - results = [] - while True: - data = wf.readframes(4000) - if len(data) == 0: - break - if rec.AcceptWaveform(data): - results.append(json.loads(rec.Result())) - - # Final result - final = json.loads(rec.FinalResult()) - results.append(final) - - # Extract text + wf = wave.open(tmp.name, 'rb') text_parts = [] total_conf = 0 conf_count = 0 - for r in results: - t = r.get('text', '').strip() - if t: - text_parts.append(t) - # Confidence from final result - if 'result' in r: - for word in r.get('result', []): - if 'conf' in word: - total_conf += word['conf'] + + while True: + data = wf.readframes(4000) + if not data: + break + if rec.AcceptWaveform(data): + r = json.loads(rec.Result()) + t = r.get('text', '').strip() + if t: + text_parts.append(t) + for w in r.get('result', []): + total_conf += w.get('conf', 0) conf_count += 1 + # Final partial + r = json.loads(rec.FinalResult()) + t = r.get('text', '').strip() + if t: + text_parts.append(t) + for w in r.get('result', []): + total_conf += w.get('conf', 0) + conf_count += 1 + text = ' '.join(text_parts).strip() confidence = round(total_conf / conf_count, 2) if conf_count > 0 else 0.0 @@ -86,13 +73,12 @@ def main(): sys.exit(1) print(json.dumps({"text": text, "confidence": confidence})) - except Exception as e: print(json.dumps({"error": str(e)})) sys.exit(2) finally: - if os.path.exists(wav_path): - os.unlink(wav_path) + try: 
os.unlink(tmp.name) + except: pass if __name__ == '__main__': main() diff --git a/src/bot/index.js b/src/bot/index.js index af9d7e87..0b477251 100644 --- a/src/bot/index.js +++ b/src/bot/index.js @@ -1088,21 +1088,36 @@ export async function initBot(config, api, tools, skills, agents) { const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`; const oggPath = `/tmp/zcode-voice-${Date.now()}.ogg`; - // Download voice file + // Download voice file via fetch (faster than curl subprocess) const { execSync } = await import('child_process'); - execSync(`curl -sL "${url}" -o "${oggPath}"`, { timeout: 15000 }); + const voiceResp = await fetch(url); + if (!voiceResp.ok) throw new Error(`Download failed: ${voiceResp.status}`); + const { writeFileSync, unlinkSync } = await import('fs'); + writeFileSync(oggPath, Buffer.from(await voiceResp.arrayBuffer())); logger.info(`Voice downloaded: ${oggPath}`); - // Run Vosk STT via Python script - const sttScript = new URL('../scripts/stt.py', import.meta.url).pathname; - const result = execSync( - `python3 "${sttScript}" "${oggPath}" 2>/dev/null`, - { timeout: 30000, encoding: 'utf-8' } - ); - const parsed = JSON.parse(result.trim()); - - // Cleanup - execSync(`rm -f "${oggPath}"`); + // Run Vosk STT — path is ../../scripts/stt.py from src/bot/ + const sttScript = new URL('../../scripts/stt.py', import.meta.url).pathname; + let parsed; + try { + const result = execSync( + `python3 "${sttScript}" "${oggPath}"`, + { timeout: 30000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] } + ); + parsed = JSON.parse(result.trim()); + } catch (e) { + // exit code 1 = no speech detected, stdout still has JSON + const stdout = e.stdout?.trim(); + if (stdout) { + try { parsed = JSON.parse(stdout); } catch { parsed = null; } + } + if (!parsed || !parsed.text) { + await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id, + '🎤 Could not detect speech in the voice message.'); + return; + } + } + unlinkSync(oggPath); if 
(parsed.error) { logger.error(`STT error: ${parsed.error}`);