diff --git a/src/common/ai/factory.js b/src/common/ai/factory.js
index 6afe2a86..a5215c5d 100644
--- a/src/common/ai/factory.js
+++ b/src/common/ai/factory.js
@@ -84,6 +84,14 @@ const PROVIDERS = {
             { id: 'whisper-medium', name: 'Whisper Medium (769M)' },
         ],
     },
+    'soniox': {
+        name: 'Soniox',
+        handler: () => require("./providers/soniox"),
+        llmModels: [],
+        sttModels: [
+            { id: 'en_v2', name: 'Soniox English v2' }
+        ],
+    },
 };
 
 function sanitizeModelId(model) {
diff --git a/src/common/ai/providers/soniox.js b/src/common/ai/providers/soniox.js
new file mode 100644
index 00000000..2fbfdf4b
--- /dev/null
+++ b/src/common/ai/providers/soniox.js
@@ -0,0 +1,179 @@
+// Soniox STT Provider
+// https://soniox.com/docs
+
+const https = require('https');
+const EventEmitter = require('events');
+
+// Give up on a recognition request whose socket stays idle this long.
+const REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * STT session that posts each audio chunk to the Soniox HTTP API and reports
+ * the outcome through 'transcription' ({ text, isFinal }) and 'error' events.
+ */
+class SonioxSTTSession extends EventEmitter {
+    /**
+     * @param {string} apiKey - Sent as the Bearer token of every request.
+     * @param {string} [model='en_v2'] - Sent in the 'soniox-model' header.
+     * @param {string} [sessionId] - Generated when omitted.
+     */
+    constructor(apiKey, model = 'en_v2', sessionId) {
+        super();
+        this.apiKey = apiKey;
+        this.model = model;
+        // slice(2, 11) keeps the same 9 characters the deprecated substr(2, 9) did.
+        this.sessionId = sessionId || `soniox_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
+        this.isRunning = false;
+        this.audioBuffer = Buffer.alloc(0);
+        this.lastTranscription = '';
+    }
+
+    /**
+     * Marks the session as running; audio sent before this call is dropped.
+     * @returns {Promise<boolean>} Always resolves to true.
+     */
+    async initialize() {
+        this.isRunning = true;
+        // Soniox does not require model download, just API key
+        return true;
+    }
+
+    /**
+     * Posts one WAV buffer to the recognize endpoint.
+     * @param {Buffer} audioBuffer - WAV-encoded audio.
+     * @returns {Promise<object>} The parsed JSON response body.
+     * @throws {Error} On network failure, timeout, non-2xx status or invalid JSON.
+     */
+    async transcribe(audioBuffer) {
+        // See https://soniox.com/docs/speech-recognition/api.html#recognize-audio
+        return new Promise((resolve, reject) => {
+            const options = {
+                hostname: 'api.soniox.com',
+                path: `/v2/recognize`,
+                method: 'POST',
+                headers: {
+                    'Authorization': `Bearer ${this.apiKey}`,
+                    'Content-Type': 'audio/wav',
+                    'Accept': 'application/json',
+                    'soniox-model': this.model
+                }
+            };
+            const req = https.request(options, res => {
+                // Keep raw bytes: decoding chunk by chunk can split a multi-byte character.
+                const chunks = [];
+                res.on('data', chunk => chunks.push(chunk));
+                res.on('error', reject);
+                res.on('end', () => {
+                    const body = Buffer.concat(chunks).toString('utf8');
+                    // An error status must not be reported as an empty transcription.
+                    if (res.statusCode < 200 || res.statusCode >= 300) {
+                        reject(new Error(`Soniox request failed with status ${res.statusCode}: ${body}`));
+                        return;
+                    }
+                    try {
+                        resolve(JSON.parse(body));
+                    } catch (e) {
+                        reject(e);
+                    }
+                });
+            });
+            req.setTimeout(REQUEST_TIMEOUT_MS, () => {
+                req.destroy(new Error('Soniox request timed out'));
+            });
+            req.on('error', reject);
+            req.write(audioBuffer);
+            req.end();
+        });
+    }
+
+    /**
+     * Transcribes one WAV chunk and emits the result; does nothing unless running.
+     * @param {Buffer} audioBuffer - WAV-encoded audio.
+     */
+    async processAudioChunk(audioBuffer) {
+        if (!this.isRunning) return;
+        try {
+            const result = await this.transcribe(audioBuffer);
+            // stop()/close() may have been called while the request was in flight.
+            if (!this.isRunning) return;
+            const text = (result && result.text) || '';
+            this.lastTranscription = text;
+            this.emit('transcription', { text, isFinal: true });
+        } catch (err) {
+            // EventEmitter throws on an unhandled 'error' event, e.g. after close()
+            // removed the listeners; log instead of rejecting the caller's promise.
+            if (this.listenerCount('error') > 0) {
+                this.emit('error', err);
+            } else {
+                console.error('[SonioxSTT] Transcription failed:', err);
+            }
+        }
+    }
+
+    /**
+     * Accepts a PCM or WAV chunk and forwards it for transcription.
+     * @param {Buffer|string} audioBuffer - Buffer, or a base64-encoded string.
+     */
+    async sendRealtimeInput(audioBuffer) {
+        // Accepts Buffer or base64 string
+        if (typeof audioBuffer === 'string') {
+            audioBuffer = Buffer.from(audioBuffer, 'base64');
+        }
+        // Nothing to send; also keeps pcmToWav from dereferencing null.
+        if (!audioBuffer || audioBuffer.length === 0) return;
+        // Convert to WAV if not already (assume PCM input)
+        if (!isWav(audioBuffer)) {
+            audioBuffer = pcmToWav(audioBuffer);
+        }
+        await this.processAudioChunk(audioBuffer);
+    }
+
+    /** Stops accepting audio; results of in-flight requests are discarded. */
+    stop() {
+        this.isRunning = false;
+    }
+
+    /** Stops the session and detaches every listener. */
+    close() {
+        this.stop();
+        this.removeAllListeners();
+    }
+}
+
+// Helper: Convert PCM to WAV (16-bit, mono, 16kHz)
+function pcmToWav(buffer, sampleRate = 16000, numChannels = 1) {
+    const header = Buffer.alloc(44);
+    const dataLength = buffer.length;
+    header.write('RIFF', 0); // ChunkID
+    header.writeUInt32LE(36 + dataLength, 4); // ChunkSize
+    header.write('WAVE', 8); // Format
+    header.write('fmt ', 12); // Subchunk1ID
+    header.writeUInt32LE(16, 16); // Subchunk1Size
+    header.writeUInt16LE(1, 20); // AudioFormat (PCM)
+    header.writeUInt16LE(numChannels, 22); // NumChannels
+    header.writeUInt32LE(sampleRate, 24); // SampleRate
+    header.writeUInt32LE(sampleRate * numChannels * 2, 28); // ByteRate
+    header.writeUInt16LE(numChannels * 2, 32); // BlockAlign
+    header.writeUInt16LE(16, 34); // BitsPerSample
+    header.write('data', 36); // Subchunk2ID
+    header.writeUInt32LE(dataLength, 40); // Subchunk2Size
+    return Buffer.concat([header, buffer]);
+}
+
+// Helper: Check if buffer is already a WAV file
+function isWav(buffer) {
+    return buffer && buffer.length > 12 && buffer.toString('ascii', 0, 4) === 'RIFF' && buffer.toString('ascii', 8, 12) === 'WAVE';
+}
+
+/**
+ * Creates a session from an options bag ({ apiKey, model, sessionId }).
+ * The session is not started here: initialize() must run before audio is accepted.
+ */
+function createSTT(opts) {
+    return new SonioxSTTSession(opts.apiKey, opts.model, opts.sessionId);
+}
+
+module.exports = {
+    createSTT,
+    SonioxSTTSession
+};
diff --git a/src/common/services/modelStateService.js b/src/common/services/modelStateService.js
index b5bed8bf..c672d1b5 100644
--- a/src/common/services/modelStateService.js
+++ b/src/common/services/modelStateService.js
@@ -284,6 +284,17 @@ class ModelStateService {
+            if (provider === 'soniox') {
+                // The Soniox provider module exports no validateApiKey, so only the key format is checked here.
+                if (typeof key !== 'string' || key.length < 32) {
+                    return { success: false, error: 'Invalid Soniox API key format.' };
+                }
+
+                this.setApiKey(provider, key);
+                console.log(`[ModelStateService] API key for ${provider} is valid.`);
+                return { success: true };
+            }
+
             // Default to success if no specific validator is found
             console.warn(`[ModelStateService] No validateApiKey function for provider: ${provider}. Assuming valid.`);
             return { success: true };
         }
 
         try {
diff --git a/src/features/listen/stt/sttService.js b/src/features/listen/stt/sttService.js
index 39f7aa8f..16ceb8cf 100644
--- a/src/features/listen/stt/sttService.js
+++ b/src/features/listen/stt/sttService.js
@@ -133,37 +133,18 @@ class SttService {
                 return;
             }
 
-            if (this.modelInfo.provider === 'whisper') {
-                // Whisper STT emits 'transcription' events with different structure
+            if (this.modelInfo.provider === 'whisper' || this.modelInfo.provider === 'soniox') {
+                // Whisper and Soniox STT emit 'transcription' events with similar structure
                 if (message.text && message.text.trim()) {
                     const finalText = message.text.trim();
-
-                    // Filter out Whisper noise transcriptions
+                    // Filter out noise for Whisper, for Soniox just check length
                     const noisePatterns = [
-                        '[BLANK_AUDIO]',
-                        '[INAUDIBLE]',
-                        '[MUSIC]',
-                        '[SOUND]',
-                        '[NOISE]',
-                        '(BLANK_AUDIO)',
-                        '(INAUDIBLE)',
-                        '(MUSIC)',
-                        '(SOUND)',
-                        '(NOISE)'
+                        '[BLANK_AUDIO]', '[INAUDIBLE]', '[MUSIC]', '[SOUND]', '[NOISE]',
+                        '(BLANK_AUDIO)', '(INAUDIBLE)', '(MUSIC)', '(SOUND)', '(NOISE)'
                     ];
-
-
-
-                    const normalizedText = finalText.toLowerCase().trim();
-
-                    const isNoise = noisePatterns.some(pattern =>
-                        finalText.includes(pattern) || finalText === pattern
-                    );
-
-
+                    const isNoise = this.modelInfo.provider === 'whisper' && noisePatterns.some(pattern => finalText.includes(pattern) || finalText === pattern);
                     if (!isNoise && finalText.length > 2) {
                         this.debounceMyCompletion(finalText);
-
                         this.sendToRenderer('stt-update', {
                             speaker: 'Me',
                             text: finalText,
@@ -171,7 +152,7 @@ class SttService {
                             isFinal: true,
                             timestamp: Date.now(),
                         });
-                    } else {
+                    } else if (this.modelInfo.provider === 'whisper') {
                         console.log(`[Whisper-Me] Filtered noise: "${finalText}"`);
                     }
                 }
@@ -246,37 +227,16 @@ class SttService {
                 return;
             }
 
-            if (this.modelInfo.provider === 'whisper') {
-                // Whisper STT emits 'transcription' events with different structure
+            if (this.modelInfo.provider === 'whisper' || this.modelInfo.provider === 'soniox') {
                 if (message.text && message.text.trim()) {
                     const finalText = message.text.trim();
-
-                    // Filter out Whisper noise transcriptions
                     const noisePatterns = [
-                        '[BLANK_AUDIO]',
-                        '[INAUDIBLE]',
-                        '[MUSIC]',
-                        '[SOUND]',
-                        '[NOISE]',
-                        '(BLANK_AUDIO)',
-                        '(INAUDIBLE)',
-                        '(MUSIC)',
-                        '(SOUND)',
-                        '(NOISE)'
+                        '[BLANK_AUDIO]', '[INAUDIBLE]', '[MUSIC]', '[SOUND]', '[NOISE]',
+                        '(BLANK_AUDIO)', '(INAUDIBLE)', '(MUSIC)', '(SOUND)', '(NOISE)'
                     ];
-
-
-                    const normalizedText = finalText.toLowerCase().trim();
-
-                    const isNoise = noisePatterns.some(pattern =>
-                        finalText.includes(pattern) || finalText === pattern
-                    );
-
-
-                    // Only process if it's not noise, not a false positive, and has meaningful content
+                    const isNoise = this.modelInfo.provider === 'whisper' && noisePatterns.some(pattern => finalText.includes(pattern) || finalText === pattern);
                     if (!isNoise && finalText.length > 2) {
                         this.debounceTheirCompletion(finalText);
-
                         this.sendToRenderer('stt-update', {
                             speaker: 'Them',
                             text: finalText,
@@ -284,7 +244,7 @@ class SttService {
                             isFinal: true,
                             timestamp: Date.now(),
                         });
-                    } else {
+                    } else if (this.modelInfo.provider === 'whisper') {
                         console.log(`[Whisper-Them] Filtered noise: "${finalText}"`);
                     }
                 }
@@ -613,4 +573,4 @@ class SttService {
     }
 }
 
-module.exports = SttService;
\ No newline at end of file
+module.exports = SttService;
\ No newline at end of file