diff --git a/k8s/transcribator.yaml b/k8s/transcribator.yaml index 3a7c604..e64e43b 100644 --- a/k8s/transcribator.yaml +++ b/k8s/transcribator.yaml @@ -42,6 +42,8 @@ spec: value: "ru" - name: PIPER_URL value: "http://piper.piper.svc:5000" + - name: PIPER_VOICES + value: "irina=http://piper.piper.svc:5000,denis=http://piper-denis.piper.svc:5000,dmitri=http://piper-dmitri.piper.svc:5000" resources: requests: memory: "32Mi" diff --git a/src/main.zig b/src/main.zig index e30330e..06212e3 100644 --- a/src/main.zig +++ b/src/main.zig @@ -5,6 +5,9 @@ const piper = @import("piper.zig"); const log = std.log.scoped(.transcribator); +const VoiceMap = std.StringHashMap([]const u8); +const ActiveChats = std.AutoHashMap(i64, []const u8); + pub fn main() !void { var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; defer _ = gpa.deinit(); @@ -16,12 +19,20 @@ pub fn main() !void { }; const whisper_url = std.posix.getenv("WHISPER_URL") orelse "http://whisper.whisper.svc:8000"; const language = std.posix.getenv("WHISPER_LANGUAGE") orelse "ru"; - const piper_url = std.posix.getenv("PIPER_URL") orelse "http://piper.piper.svc:5000"; + const voices_env = std.posix.getenv("PIPER_VOICES") orelse ""; + const default_piper_url = std.posix.getenv("PIPER_URL") orelse ""; + + var voices = VoiceMap.init(allocator); + defer voices.deinit(); + parseVoices(&voices, voices_env); + + var active_chats = ActiveChats.init(allocator); + defer active_chats.deinit(); var bot = try telegram.TelegramBot.init(allocator, token); defer bot.deinit(); - log.info("Bot started. Whisper: {s}, language: {s}, Piper: {s}", .{ whisper_url, language, piper_url }); + log.info("Bot started. Whisper: {s}, language: {s}, voices: {d}", .{ whisper_url, language, voices.count() }); var offset: i64 = 0; @@ -35,18 +46,35 @@ pub fn main() !void { for (updates.parsed.value.result) |update| { offset = update.update_id + 1; - processUpdate(allocator, &bot, update, whisper_url, language, piper_url); + processUpdate(allocator, &bot, update, whisper_url, language, &voices, &active_chats, default_piper_url); } } } +fn parseVoices(map: *VoiceMap, env: []const u8) void { + if (env.len == 0) return; + var iter = std.mem.splitScalar(u8, env, ','); + while (iter.next()) |entry| { + const trimmed = std.mem.trim(u8, entry, " "); + if (std.mem.indexOfScalar(u8, trimmed, '=')) |sep| { + map.put(trimmed[0..sep], trimmed[sep + 1..]) catch {}; + } + } +} + +fn isPrivateChat(chat_type: []const u8) bool { + return std.mem.eql(u8, chat_type, "private"); +} + fn processUpdate( allocator: std.mem.Allocator, bot: *telegram.TelegramBot, update: telegram.Update, whisper_url: []const u8, language: []const u8, - piper_url: []const u8, + voices: *VoiceMap, + active_chats: *ActiveChats, + default_piper_url: []const u8, ) void { const message = update.message orelse return; @@ -69,11 +97,31 @@ fn processUpdate( return; } - // Text message → TTS (skip commands starting with /) + // Text message if (message.text) |text| { - if (text.len > 0 and text[0] != '/') { + if (text.len == 0) return; + + // Commands + if (text[0] == '/') { + handleCommand(allocator, bot, message, text, voices, active_chats); + return; + } + + // TTS: check active speak mode or private chat default + var piper_url: ?[]const u8 = active_chats.get(message.chat.id); + + if (piper_url == null and isPrivateChat(message.chat.@"type")) { + if (default_piper_url.len > 0) { + piper_url = default_piper_url; + } else { + var it = voices.valueIterator(); + if (it.next()) |v| piper_url = v.*; + } + } + + if (piper_url) |url| { log.info("Processing TTS for text message in chat {d}", .{message.chat.id}); - handleTTS(allocator, bot, message, text, piper_url) catch |err| { + handleTTS(allocator, bot, message, text, url) catch |err| { log.err("TTS failed: {s}", .{@errorName(err)}); bot.sendMessage(message.chat.id, "TTS failed.", message.message_id) catch {}; }; @@ -81,6 +129,72 @@ fn processUpdate( } } +fn handleCommand( + allocator: std.mem.Allocator, + bot: *telegram.TelegramBot, + message: telegram.Message, + text: []const u8, + voices: *VoiceMap, + active_chats: *ActiveChats, +) void { + // Only handle /speak command + if (!std.mem.startsWith(u8, text, "/speak")) return; + + var rest = text["/speak".len..]; + + // Skip @botname suffix (e.g. /speak@my_bot denis) + if (rest.len > 0 and rest[0] == '@') { + if (std.mem.indexOfScalar(u8, rest, ' ')) |space| { + rest = rest[space..]; + } else { + rest = ""; + } + } + + const arg = std.mem.trim(u8, rest, " "); + + // /speak stop or /speak (no args) → disable TTS + if (arg.len == 0 or std.mem.eql(u8, arg, "stop")) { + const was_active = active_chats.remove(message.chat.id); + if (was_active) { + bot.sendMessage(message.chat.id, "TTS отключён.", message.message_id) catch {}; + } else { + // No active TTS, show available voices + const reply = buildVoiceListMessage(allocator, voices, "Доступные голоса: ") catch return; + defer allocator.free(reply); + bot.sendMessage(message.chat.id, reply, message.message_id) catch {}; + } + return; + } + + // /speak → enable TTS with specified voice + if (voices.get(arg)) |url| { + active_chats.put(message.chat.id, url) catch {}; + const reply = std.fmt.allocPrint(allocator, "TTS включён, голос: {s}", .{arg}) catch return; + defer allocator.free(reply); + bot.sendMessage(message.chat.id, reply, message.message_id) catch {}; + } else { + const reply = buildVoiceListMessage(allocator, voices, "Неизвестный голос. Доступные: ") catch return; + defer allocator.free(reply); + bot.sendMessage(message.chat.id, reply, message.message_id) catch {}; + } +} + +fn buildVoiceListMessage(allocator: std.mem.Allocator, voices: *VoiceMap, prefix: []const u8) ![]u8 { + var buf: std.ArrayList(u8) = .empty; + defer buf.deinit(allocator); + + try buf.appendSlice(allocator, prefix); + var first = true; + var it = voices.iterator(); + while (it.next()) |entry| { + if (!first) try buf.appendSlice(allocator, ", "); + try buf.appendSlice(allocator, entry.key_ptr.*); + first = false; + } + return buf.toOwnedSlice(allocator); +} + fn handleTranscription( allocator: std.mem.Allocator, bot: *telegram.TelegramBot, @@ -138,16 +252,16 @@ fn handleTranscription( // Transcribe log.info("Step 3: transcribe {s}", .{audio_path}); - const text = try whisper.transcribe(allocator, whisper_url, audio_path, language); - defer allocator.free(text); - log.info("Step 3 done, text length: {d}", .{text.len}); + const transcribed_text = try whisper.transcribe(allocator, whisper_url, audio_path, language); + defer allocator.free(transcribed_text); + log.info("Step 3 done, text length: {d}", .{transcribed_text.len}); // Send response log.info("Step 4: sendMessage", .{}); - if (text.len == 0) { + if (transcribed_text.len == 0) { try bot.sendMessage(message.chat.id, "(empty transcription)", message.message_id); } else { - try bot.sendMessage(message.chat.id, text, message.message_id); + try bot.sendMessage(message.chat.id, transcribed_text, message.message_id); } log.info("Step 4 done", .{}); } diff --git a/src/telegram.zig b/src/telegram.zig index 9f9fab7..9841ead 100644 --- a/src/telegram.zig +++ b/src/telegram.zig @@ -14,7 +14,10 @@ pub const VideoNote = struct { pub const Message = struct { message_id: i64, - chat: struct { id: i64 }, + chat: struct { + id: i64, + @"type": []const u8 = "private", + }, text: ?[]const u8 = null, voice: ?Voice = null, video_note: ?VideoNote = null,