From 86acdfab28d8725df0fda2e4c368845b4887a7e7 Mon Sep 17 00:00:00 2001
From: Mikhail Kilin <makilin@sberbank.ru>
Date: Thu, 19 Feb 2026 13:24:41 +0300
Subject: [PATCH] Add text-to-speech via Piper TTS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plain-text messages (anything not starting with "/") are synthesized by
Piper TTS; the resulting WAV is converted to OGG Opus and sent back as a
voice reply.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 k8s/transcribator.yaml |  2 ++
 src/main.zig           | 69 +++++++++++++++++++++++++++++------------
 src/piper.zig          | 70 ++++++++++++++++++++++++++++++++++++++++++
 src/telegram.zig       | 32 +++++++++++++++++++
 4 files changed, 154 insertions(+), 19 deletions(-)
 create mode 100644 src/piper.zig

diff --git a/k8s/transcribator.yaml b/k8s/transcribator.yaml
index f3e89af..ec12667 100644
--- a/k8s/transcribator.yaml
+++ b/k8s/transcribator.yaml
@@ -40,6 +40,8 @@ spec:
               value: "http://whisper.whisper.svc:8000"
             - name: WHISPER_LANGUAGE
               value: "ru"
+            - name: PIPER_URL
+              value: "http://piper.piper.svc:5000"
           resources:
             requests:
               memory: "32Mi"
diff --git a/src/main.zig b/src/main.zig
index 02b5462..e30330e 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,6 +1,7 @@
 const std = @import("std");
 const telegram = @import("telegram.zig");
 const whisper = @import("whisper.zig");
+const piper = @import("piper.zig");
 
 const log = std.log.scoped(.transcribator);
 
@@ -15,11 +16,12 @@ pub fn main() !void {
     };
     const whisper_url = std.posix.getenv("WHISPER_URL") orelse "http://whisper.whisper.svc:8000";
     const language = std.posix.getenv("WHISPER_LANGUAGE") orelse "ru";
+    const piper_url = std.posix.getenv("PIPER_URL") orelse "http://piper.piper.svc:5000";
 
     var bot = try telegram.TelegramBot.init(allocator, token);
     defer bot.deinit();
 
-    log.info("Bot started. Whisper: {s}, language: {s}", .{ whisper_url, language });
+    log.info("Bot started. Whisper: {s}, language: {s}, Piper: {s}", .{ whisper_url, language, piper_url });
 
     var offset: i64 = 0;
 
@@ -33,7 +35,7 @@ pub fn main() !void {
 
         for (updates.parsed.value.result) |update| {
             offset = update.update_id + 1;
-            processUpdate(allocator, &bot, update, whisper_url, language);
+            processUpdate(allocator, &bot, update, whisper_url, language, piper_url);
         }
     }
 }
@@ -44,30 +46,39 @@ fn processUpdate(
     update: telegram.Update,
     whisper_url: []const u8,
     language: []const u8,
+    piper_url: []const u8,
 ) void {
     const message = update.message orelse return;
 
-    var file_id: ?[]const u8 = null;
-    var is_video = false;
-
+    // Voice / video_note → transcription
     if (message.voice) |voice| {
-        file_id = voice.file_id;
-    } else if (message.video_note) |vn| {
-        file_id = vn.file_id;
-        is_video = true;
+        log.info("Processing voice message in chat {d}", .{message.chat.id});
+        handleTranscription(allocator, bot, message, voice.file_id, false, whisper_url, language) catch |err| {
+            log.err("Transcription failed: {s}", .{@errorName(err)});
+            bot.sendMessage(message.chat.id, "Transcription failed.", message.message_id) catch {};
+        };
+        return;
     }
 
-    const fid = file_id orelse return;
+    if (message.video_note) |vn| {
+        log.info("Processing video_note message in chat {d}", .{message.chat.id});
+        handleTranscription(allocator, bot, message, vn.file_id, true, whisper_url, language) catch |err| {
+            log.err("Transcription failed: {s}", .{@errorName(err)});
+            bot.sendMessage(message.chat.id, "Transcription failed.", message.message_id) catch {};
+        };
+        return;
+    }
 
-    log.info("Processing {s} message in chat {d}", .{
-        if (is_video) @as([]const u8, "video_note") else @as([]const u8, "voice"),
-        message.chat.id,
-    });
-
-    handleTranscription(allocator, bot, message, fid, is_video, whisper_url, language) catch |err| {
-        log.err("Transcription failed: {s}", .{@errorName(err)});
-        bot.sendMessage(message.chat.id, "Transcription failed.", message.message_id) catch {};
-    };
+    // Text message → TTS (skip commands starting with /)
+    if (message.text) |text| {
+        if (text.len > 0 and text[0] != '/') {
+            log.info("Processing TTS for text message in chat {d}", .{message.chat.id});
+            handleTTS(allocator, bot, message, text, piper_url) catch |err| {
+                log.err("TTS failed: {s}", .{@errorName(err)});
+                bot.sendMessage(message.chat.id, "TTS failed.", message.message_id) catch {};
+            };
+        }
+    }
 }
 
 fn handleTranscription(
@@ -140,3 +151,23 @@ fn handleTranscription(
     }
     log.info("Step 4 done", .{});
 }
+
+/// Turns `text` into speech through Piper and replies to `message` with the voice note.
+/// Once synthesis succeeds, the temporary OGG file and its path are released on every exit.
+fn handleTTS(
+    allocator: std.mem.Allocator,
+    bot: *telegram.TelegramBot,
+    message: telegram.Message,
+    text: []const u8,
+    piper_url: []const u8,
+) !void {
+    log.info("TTS step 1: synthesize", .{});
+    const voice_file = try piper.synthesize(allocator, piper_url, text, message.message_id);
+    defer allocator.free(voice_file); // runs second: release the path string
+    defer std.fs.deleteFileAbsolute(voice_file) catch {}; // runs first: best-effort file cleanup
+    log.info("TTS step 1 done: {s}", .{voice_file});
+
+    log.info("TTS step 2: sendVoice", .{});
+    try bot.sendVoice(message.chat.id, voice_file, message.message_id);
+    log.info("TTS step 2 done", .{});
+}
diff --git a/src/piper.zig b/src/piper.zig
new file mode 100644
index 0000000..b7d154d
--- /dev/null
+++ b/src/piper.zig
@@ -0,0 +1,70 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+
+const log = std.log.scoped(.transcribator);
+
+/// Synthesizes `text` via the Piper service at `piper_url` and transcodes the result
+/// to OGG Opus.
+///
+/// Pipeline: curl POSTs the text and stores the response in `/tmp/tts_<msg_id>.wav`,
+/// then ffmpeg re-encodes it with libopus into `/tmp/tts_<msg_id>.ogg`. The WAV file
+/// is removed before returning, on success and on failure alike. `msg_id` is used
+/// only to derive those two temporary file names.
+///
+/// Returns the OGG path, allocated with `allocator`; the caller owns both the slice
+/// and the file it names. On failure the OGG file is deleted and the path freed here.
+/// Fails with `error.HttpRequestFailed` when curl or ffmpeg does not exit cleanly
+/// (the name is kept for ffmpeg too, so callers see the same error as before), and
+/// propagates allocation and process-spawn errors unchanged.
+pub fn synthesize(allocator: Allocator, piper_url: []const u8, text: []const u8, msg_id: i64) ![]u8 {
+    const wav_path = try std.fmt.allocPrint(allocator, "/tmp/tts_{d}.wav", .{msg_id});
+    // The WAV is only an intermediate artifact: drop it on every exit path.
+    defer {
+        std.fs.deleteFileAbsolute(wav_path) catch {};
+        allocator.free(wav_path);
+    }
+
+    const ogg_path = try std.fmt.allocPrint(allocator, "/tmp/tts_{d}.ogg", .{msg_id});
+    // On success the caller takes over the OGG, so clean it up only on error.
+    errdefer {
+        std.fs.deleteFileAbsolute(ogg_path) catch {};
+        allocator.free(ogg_path);
+    }
+
+    // POST text to Piper TTS, save WAV. `-f` makes curl exit non-zero on HTTP errors;
+    // `--data-raw` sends the text verbatim, so a leading '@' is not read as a file name.
+    log.info("Piper TTS: synthesizing {d} chars", .{text.len});
+    try runTool(allocator, "Piper TTS curl", &.{ "curl", "-sf", "--max-time", "120", "-X", "POST", "-H", "Content-Type: text/plain", "--data-raw", text, "-o", wav_path, piper_url });
+
+    // Convert WAV to OGG Opus. `-nostdin` stops ffmpeg from treating the bot's
+    // standard input as interactive commands; `-y` overwrites a stale output file.
+    log.info("Piper TTS: converting WAV to OGG", .{});
+    try runTool(allocator, "ffmpeg WAV→OGG", &.{ "ffmpeg", "-nostdin", "-y", "-i", wav_path, "-c:a", "libopus", ogg_path });
+
+    return ogg_path;
+}
+
+/// Spawns `argv`, waits for it to finish and checks that it exited with status 0.
+///
+/// `tool` is a human-readable label used only in error log lines. Any non-zero exit
+/// status or abnormal termination is logged and reported as `error.HttpRequestFailed`.
+fn runTool(allocator: Allocator, tool: []const u8, argv: []const []const u8) !void {
+    var child = std.process.Child.init(argv, allocator);
+    // Neither tool needs input and their output is not inspected, so detach all three
+    // standard streams instead of inheriting the bot's.
+    child.stdin_behavior = .Ignore;
+    child.stdout_behavior = .Ignore;
+    child.stderr_behavior = .Ignore;
+
+    switch (try child.spawnAndWait()) {
+        .Exited => |code| if (code != 0) {
+            log.err("{s} failed with exit code {d}", .{ tool, code });
+            return error.HttpRequestFailed;
+        },
+        // Any termination other than a normal exit counts as failure.
+        else => {
+            log.err("{s} terminated abnormally", .{tool});
+            return error.HttpRequestFailed;
+        },
+    }
+}
diff --git a/src/telegram.zig b/src/telegram.zig
index b6154d7..9f9fab7 100644
--- a/src/telegram.zig
+++ b/src/telegram.zig
@@ -15,6 +15,7 @@ pub const VideoNote = struct {
 pub const Message = struct {
     message_id: i64,
     chat: struct { id: i64 },
+    text: ?[]const u8 = null, // text body of the message, if any; defaults to null
     voice: ?Voice = null,
     video_note: ?VideoNote = null,
 };
@@ -134,4 +135,35 @@ pub const TelegramBot = struct {
         const resp = http.httpPostJson(self.allocator, url, json_body) catch return;
         self.allocator.free(resp);
     }
+
+    /// Sends the OGG file at `ogg_path` to `chat_id` through the `sendVoice` endpoint,
+    /// adding `reply_to_message_id` when `reply_to` is given. The response body is
+    /// discarded; allocation and request errors are returned to the caller.
+    pub fn sendVoice(self: *TelegramBot, chat_id: i64, ogg_path: []const u8, reply_to: ?i64) !void {
+        const endpoint = try std.fmt.allocPrint(self.allocator, "{s}/sendVoice", .{self.api_base});
+        defer self.allocator.free(endpoint);
+
+        const chat_field = try std.fmt.allocPrint(self.allocator, "{d}", .{chat_id});
+        defer self.allocator.free(chat_field);
+
+        // Render the optional reply target up front so a single defer covers it.
+        const reply_field: ?[]u8 = if (reply_to) |id|
+            try std.fmt.allocPrint(self.allocator, "{d}", .{id})
+        else
+            null;
+        defer if (reply_field) |owned| self.allocator.free(owned);
+
+        // Slot 0 always carries chat_id; slot 1 is filled only for replies.
+        var form: [2][2][]const u8 = undefined;
+        form[0] = .{ "chat_id", chat_field };
+        var used: usize = 1;
+        if (reply_field) |value| {
+            form[1] = .{ "reply_to_message_id", value };
+            used = 2;
+        }
+
+        // "voice" / "voice.ogg" are presumably the form part name and upload file name.
+        const reply = try http.httpPostMultipart(self.allocator, endpoint, "voice", ogg_path, "voice.ogg", form[0..used]);
+        self.allocator.free(reply);
+    }
 };