99 lines
3.0 KiB
Python
99 lines
3.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Vosk STT — Transcribe OGG/voice to text.
|
|
Usage: python3 stt.py <input_file> [language]
|
|
input_file: path to audio file (ogg, wav, mp3, etc.)
|
|
language: 'en' (default) or 'ge' — Georgian model; any other value falls back to the English model
|
|
Output: JSON to stdout: {"text": "...", "confidence": 0.95}, or {"error": "..."} on failure
|
|
Exit codes: 0=success, 1=no speech, 2=error
|
|
"""
|
|
import sys, os, json, subprocess, tempfile, wave
|
|
|
|
def _summarize(results):
    """Combine recognizer result dicts into one transcript and a confidence.

    Args:
        results: list of dicts parsed from the recognizer's JSON output.
            Each may contain a 'text' string and a 'result' list of
            per-word dicts that may carry a numeric 'conf'.

    Returns:
        (text, confidence): the non-empty 'text' values joined by single
        spaces, and the mean of all per-word 'conf' values rounded to two
        decimals (0.0 when no word carries a confidence).
    """
    text_parts = []
    confidences = []
    for entry in results:
        segment = entry.get('text', '').strip()
        if segment:
            text_parts.append(segment)
        confidences.extend(
            word['conf'] for word in entry.get('result', []) if 'conf' in word
        )

    text = ' '.join(text_parts).strip()
    if confidences:
        confidence = round(sum(confidences) / len(confidences), 2)
    else:
        confidence = 0.0
    return text, confidence


def main():
    """Transcribe the audio file named on the command line and print JSON.

    Reads sys.argv: argv[1] is the audio file path, optional argv[2] is the
    language key ('en' or 'ge'; anything else uses the English model path).
    The input is converted with ffmpeg to a temporary 16 kHz mono WAV file,
    fed to a Vosk recognizer, and the temporary file is removed afterwards.

    Prints exactly one JSON object to stdout:
        {"text": ..., "confidence": ...} on success or when no speech was
        recognized, {"error": ...} on failure.

    Exits via sys.exit with 1 when no text was recognized and 2 on usage
    errors, a missing model directory, ffmpeg failure, or any exception;
    returns normally (exit status 0) on success.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: stt.py <audio_file> [en|ge]"}))
        sys.exit(2)

    audio_file = sys.argv[1]
    lang = sys.argv[2] if len(sys.argv) > 2 else 'en'

    # Suppress vosk logging
    os.environ['VOSK_LOG_LEVEL'] = '-1'

    model_path = {
        'en': '/home/uroma2/vosk-model',
        'ge': '/home/uroma2/vosk-model-ge',
    }.get(lang, '/home/uroma2/vosk-model')

    if not os.path.isdir(model_path):
        print(json.dumps({"error": f"Model not found: {model_path}"}))
        sys.exit(2)

    # Reserve a temp path only; ffmpeg overwrites it (-y), and the finally
    # clause below is responsible for deleting it.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        wav_path = tmp.name

    try:
        # Convert to 16kHz mono WAV using ffmpeg
        result = subprocess.run(
            ['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1', '-f', 'wav', wav_path],
            capture_output=True, timeout=30
        )
        if result.returncode != 0:
            # Decode leniently so odd bytes cannot mask the real failure, and
            # report the tail: the head of ffmpeg's stderr is its banner.
            detail = result.stderr.decode(errors='replace').strip()[-200:]
            print(json.dumps({"error": f"ffmpeg failed: {detail}"}))
            sys.exit(2)

        # Imported lazily so usage/model/ffmpeg errors are reported even
        # when vosk is unavailable.
        import vosk
        vosk.SetLogLevel(-1)
        model = vosk.Model(model_path)
        rec = vosk.KaldiRecognizer(model, 16000)
        # Without word-level output the results carry no per-word 'conf',
        # and the reported confidence would always be 0.0.
        rec.SetWords(True)

        results = []
        # Context manager guarantees the WAV handle is closed on every path,
        # including the sys.exit below.
        with wave.open(wav_path, 'rb') as wf:
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
                print(json.dumps({"error": "Audio format mismatch after conversion"}))
                sys.exit(2)

            while True:
                data = wf.readframes(4000)
                if not data:
                    break
                if rec.AcceptWaveform(data):
                    results.append(json.loads(rec.Result()))

        # Final result: flush whatever the recognizer still has buffered.
        results.append(json.loads(rec.FinalResult()))

        text, confidence = _summarize(results)

        if not text:
            print(json.dumps({"text": "", "confidence": 0}))
            sys.exit(1)

        print(json.dumps({"text": text, "confidence": confidence}))

    except Exception as e:
        # Top-level boundary: report any failure as JSON. sys.exit raises
        # SystemExit, which is not an Exception and so passes through.
        print(json.dumps({"error": str(e)}))
        sys.exit(2)
    finally:
        if os.path.exists(wav_path):
            os.unlink(wav_path)
|
|
|
|
# Run the transcription only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|