From 86acdfab28d8725df0fda2e4c368845b4887a7e7 Mon Sep 17 00:00:00 2001
From: Mikhail Kilin
Date: Thu, 19 Feb 2026 13:24:41 +0300
Subject: [PATCH] Add text-to-speech via Piper TTS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Send text message → Piper TTS → WAV → OGG Opus → voice reply.

Co-Authored-By: Claude Opus 4.6
---
 k8s/transcribator.yaml |  2 ++
 src/main.zig           | 72 +++++++++++++++++++++++++++----------
 src/piper.zig          | 81 ++++++++++++++++++++++++++++++++++++++++++
 src/telegram.zig       | 36 +++++++++++++++++++
 4 files changed, 172 insertions(+), 19 deletions(-)
 create mode 100644 src/piper.zig

diff --git a/k8s/transcribator.yaml b/k8s/transcribator.yaml
index f3e89af..ec12667 100644
--- a/k8s/transcribator.yaml
+++ b/k8s/transcribator.yaml
@@ -40,6 +40,8 @@ spec:
               value: "http://whisper.whisper.svc:8000"
             - name: WHISPER_LANGUAGE
               value: "ru"
+            - name: PIPER_URL
+              value: "http://piper.piper.svc:5000"
           resources:
             requests:
               memory: "32Mi"
diff --git a/src/main.zig b/src/main.zig
index 02b5462..e30330e 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,6 +1,7 @@
 const std = @import("std");
 const telegram = @import("telegram.zig");
 const whisper = @import("whisper.zig");
+const piper = @import("piper.zig");
 
 const log = std.log.scoped(.transcribator);
 
@@ -15,11 +16,12 @@ pub fn main() !void {
     };
     const whisper_url = std.posix.getenv("WHISPER_URL") orelse "http://whisper.whisper.svc:8000";
     const language = std.posix.getenv("WHISPER_LANGUAGE") orelse "ru";
+    const piper_url = std.posix.getenv("PIPER_URL") orelse "http://piper.piper.svc:5000";
 
     var bot = try telegram.TelegramBot.init(allocator, token);
     defer bot.deinit();
 
-    log.info("Bot started. Whisper: {s}, language: {s}", .{ whisper_url, language });
+    log.info("Bot started. Whisper: {s}, language: {s}, Piper: {s}", .{ whisper_url, language, piper_url });
 
     var offset: i64 = 0;
 
@@ -33,7 +35,7 @@ pub fn main() !void {
 
         for (updates.parsed.value.result) |update| {
             offset = update.update_id + 1;
-            processUpdate(allocator, &bot, update, whisper_url, language);
+            processUpdate(allocator, &bot, update, whisper_url, language, piper_url);
         }
     }
 }
@@ -44,30 +46,39 @@ fn processUpdate(
     update: telegram.Update,
     whisper_url: []const u8,
     language: []const u8,
+    piper_url: []const u8,
 ) void {
     const message = update.message orelse return;
 
-    var file_id: ?[]const u8 = null;
-    var is_video = false;
-
+    // Voice / video_note → transcription
     if (message.voice) |voice| {
-        file_id = voice.file_id;
-    } else if (message.video_note) |vn| {
-        file_id = vn.file_id;
-        is_video = true;
+        log.info("Processing voice message in chat {d}", .{message.chat.id});
+        handleTranscription(allocator, bot, message, voice.file_id, false, whisper_url, language) catch |err| {
+            log.err("Transcription failed: {s}", .{@errorName(err)});
+            bot.sendMessage(message.chat.id, "Transcription failed.", message.message_id) catch {};
+        };
+        return;
     }
 
-    const fid = file_id orelse return;
+    if (message.video_note) |vn| {
+        log.info("Processing video_note message in chat {d}", .{message.chat.id});
+        handleTranscription(allocator, bot, message, vn.file_id, true, whisper_url, language) catch |err| {
+            log.err("Transcription failed: {s}", .{@errorName(err)});
+            bot.sendMessage(message.chat.id, "Transcription failed.", message.message_id) catch {};
+        };
+        return;
+    }
 
-    log.info("Processing {s} message in chat {d}", .{
-        if (is_video) @as([]const u8, "video_note") else @as([]const u8, "voice"),
-        message.chat.id,
-    });
-
-    handleTranscription(allocator, bot, message, fid, is_video, whisper_url, language) catch |err| {
-        log.err("Transcription failed: {s}", .{@errorName(err)});
-        bot.sendMessage(message.chat.id, "Transcription failed.", message.message_id) catch {};
-    };
+    // Text message → TTS (skip commands starting with /)
+    if (message.text) |text| {
+        if (text.len > 0 and text[0] != '/') {
+            log.info("Processing TTS for text message in chat {d}", .{message.chat.id});
+            handleTTS(allocator, bot, message, text, piper_url) catch |err| {
+                log.err("TTS failed: {s}", .{@errorName(err)});
+                bot.sendMessage(message.chat.id, "TTS failed.", message.message_id) catch {};
+            };
+        }
+    }
 }
 
 fn handleTranscription(
@@ -140,3 +151,26 @@ fn handleTranscription(
     }
     log.info("Step 4 done", .{});
 }
+
+/// Synthesizes `text` with Piper and sends it as a voice reply to `message`.
+/// The OGG file returned by `piper.synthesize` is deleted and its path freed
+/// when this function returns, whether or not `sendVoice` succeeds.
+fn handleTTS(
+    allocator: std.mem.Allocator,
+    bot: *telegram.TelegramBot,
+    message: telegram.Message,
+    text: []const u8,
+    piper_url: []const u8,
+) !void {
+    log.info("TTS step 1: synthesize", .{});
+    const ogg_path = try piper.synthesize(allocator, piper_url, text, message.message_id);
+    defer {
+        std.fs.deleteFileAbsolute(ogg_path) catch {};
+        allocator.free(ogg_path);
+    }
+    log.info("TTS step 1 done: {s}", .{ogg_path});
+
+    log.info("TTS step 2: sendVoice", .{});
+    try bot.sendVoice(message.chat.id, ogg_path, message.message_id);
+    log.info("TTS step 2 done", .{});
+}
diff --git a/src/piper.zig b/src/piper.zig
new file mode 100644
index 0000000..b7d154d
--- /dev/null
+++ b/src/piper.zig
@@ -0,0 +1,81 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+
+const log = std.log.scoped(.transcribator);
+
+/// Synthesizes `text` to speech and returns the path of an OGG Opus file.
+///
+/// Runs `curl` to POST `text` to `piper_url` and save the response as
+/// `/tmp/tts_{msg_id}.wav`, then runs `ffmpeg` to transcode that file to
+/// `/tmp/tts_{msg_id}.ogg`. The intermediate WAV file is always deleted.
+///
+/// The caller owns the result: it must delete the file and free the returned
+/// slice with `allocator`. On failure the OGG path is deleted (best effort)
+/// and freed here. Returns `error.HttpRequestFailed` if curl fails and
+/// `error.AudioConversionFailed` if ffmpeg fails; allocation and spawn
+/// errors are propagated unchanged.
+pub fn synthesize(allocator: Allocator, piper_url: []const u8, text: []const u8, msg_id: i64) ![]u8 {
+    const wav_path = try std.fmt.allocPrint(allocator, "/tmp/tts_{d}.wav", .{msg_id});
+    defer {
+        std.fs.deleteFileAbsolute(wav_path) catch {};
+        allocator.free(wav_path);
+    }
+
+    const ogg_path = try std.fmt.allocPrint(allocator, "/tmp/tts_{d}.ogg", .{msg_id});
+    errdefer {
+        std.fs.deleteFileAbsolute(ogg_path) catch {};
+        allocator.free(ogg_path);
+    }
+
+    // POST text to Piper TTS, save WAV
+    log.info("Piper TTS: synthesizing {d} chars", .{text.len});
+    {
+        var child = std.process.Child.init(
+            &.{ "curl", "-sf", "--max-time", "120", "-X", "POST", "-H", "Content-Type: text/plain", "--data-raw", text, "-o", wav_path, piper_url },
+            allocator,
+        );
+        child.stdout_behavior = .Ignore;
+        child.stderr_behavior = .Ignore;
+        const term = try child.spawnAndWait();
+
+        switch (term) {
+            .Exited => |code| {
+                if (code != 0) {
+                    log.err("Piper TTS curl failed with exit code {d}", .{code});
+                    return error.HttpRequestFailed;
+                }
+            },
+            else => {
+                log.err("Piper TTS curl terminated abnormally", .{});
+                return error.HttpRequestFailed;
+            },
+        }
+    }
+
+    // Convert WAV to OGG Opus
+    log.info("Piper TTS: converting WAV to OGG", .{});
+    {
+        var child = std.process.Child.init(
+            &.{ "ffmpeg", "-y", "-i", wav_path, "-c:a", "libopus", ogg_path },
+            allocator,
+        );
+        child.stdout_behavior = .Ignore;
+        child.stderr_behavior = .Ignore;
+        const term = try child.spawnAndWait();
+
+        switch (term) {
+            .Exited => |code| {
+                if (code != 0) {
+                    log.err("ffmpeg WAV→OGG failed with exit code {d}", .{code});
+                    return error.AudioConversionFailed;
+                }
+            },
+            else => {
+                log.err("ffmpeg terminated abnormally", .{});
+                return error.AudioConversionFailed;
+            },
+        }
+    }
+
+    return ogg_path;
+}
diff --git a/src/telegram.zig b/src/telegram.zig
index b6154d7..9f9fab7 100644
--- a/src/telegram.zig
+++ b/src/telegram.zig
@@ -15,6 +15,7 @@ pub const VideoNote = struct {
 pub const Message = struct {
     message_id: i64,
     chat: struct { id: i64 },
+    text: ?[]const u8 = null,
     voice: ?Voice = null,
     video_note: ?VideoNote = null,
 };
@@ -134,4 +135,39 @@ pub const TelegramBot = struct {
         const resp = http.httpPostJson(self.allocator, url, json_body) catch return;
         self.allocator.free(resp);
     }
+
+    /// Posts the file at `ogg_path` to `{api_base}/sendVoice` through
+    /// `http.httpPostMultipart`, passing `chat_id` and, when `reply_to` is
+    /// non-null, `reply_to_message_id` as additional fields. The response
+    /// body is freed without being inspected.
+    pub fn sendVoice(self: *TelegramBot, chat_id: i64, ogg_path: []const u8, reply_to: ?i64) !void {
+        const url = try std.fmt.allocPrint(self.allocator, "{s}/sendVoice", .{self.api_base});
+        defer self.allocator.free(url);
+
+        const chat_id_str = try std.fmt.allocPrint(self.allocator, "{d}", .{chat_id});
+        defer self.allocator.free(chat_id_str);
+
+        var fields_buf: [2][2][]const u8 = undefined;
+        var field_count: usize = 1;
+        fields_buf[0] = .{ "chat_id", chat_id_str };
+
+        var reply_str: ?[]u8 = null;
+        defer if (reply_str) |s| self.allocator.free(s);
+
+        if (reply_to) |r| {
+            reply_str = try std.fmt.allocPrint(self.allocator, "{d}", .{r});
+            fields_buf[1] = .{ "reply_to_message_id", reply_str.? };
+            field_count = 2;
+        }
+
+        const resp = try http.httpPostMultipart(
+            self.allocator,
+            url,
+            "voice",
+            ogg_path,
+            "voice.ogg",
+            fields_buf[0..field_count],
+        );
+        self.allocator.free(resp);
+    }
 };