feat(telegram): ingest and render inbound voice messages

2026-02-23 12:21:53 +07:00
parent c94c229a1a
commit acd974766a
4 changed files with 225 additions and 5 deletions
--- a/omni_chat/src/worker.ts
+++ b/omni_chat/src/worker.ts
@@ -28,6 +28,7 @@ type OmniInboundEnvelopeV1 = {

 export const RECEIVER_FLOW_QUEUE_NAME = (process.env.RECEIVER_FLOW_QUEUE_NAME || "receiver.flow").trim();
 const TELEGRAM_PLACEHOLDER_PREFIX = "Telegram ";
+const TELEGRAM_AUDIO_FILE_MARKER = "tg-file:"; // prefix put in front of a media file id to build the stored audioUrl value; NOTE(review): presumably resolved to a real URL elsewhere - confirm

 function redisConnectionFromEnv(): ConnectionOptions {
  const raw = (process.env.REDIS_URL || "redis://localhost:6379").trim();
@@ -47,6 +48,45 @@ function normalizeText(input: unknown) {
  return t || "[no text]";
 }

+type TelegramInboundMedia = { // Media fields pulled out of a normalized inbound payload by parseTelegramInboundMedia.
+  kind: "voice" | "audio" | "video_note" | null; // trimmed, lower-cased mediaKind; null for any value outside these three
+  fileId: string | null; // asString(mediaFileId); NOTE(review): presumably the Telegram file_id - confirm against the producer
+  durationSec: number | null; // whole seconds, at least 1; null when the duration is missing, non-numeric or not positive
+  label: string | null; // asString(mediaTitle); appended to "[audio]" in the fallback text
+};
+
+function parseTelegramInboundMedia(normalized: OmniInboundEnvelopeV1["payloadNormalized"]): TelegramInboundMedia { // Reads mediaKind, mediaFileId, mediaDurationSec and mediaTitle from the normalized payload and returns them in validated form.
+  const kindRaw = String(normalized.mediaKind ?? "").trim().toLowerCase(); // a missing kind becomes "", which falls through to null below
+  const kind: TelegramInboundMedia["kind"] =
+    kindRaw === "voice" || kindRaw === "audio" || kindRaw === "video_note"
+      ? kindRaw
+      : null; // every other media kind is reported as "no media"
+
+  const fileId = asString(normalized.mediaFileId);
+  const durationRaw = normalized.mediaDurationSec;
+  const durationParsed = // accept a number or a numeric string; any other type becomes NaN
+    typeof durationRaw === "number"
+      ? durationRaw
+      : typeof durationRaw === "string"
+        ? Number(durationRaw)
+        : Number.NaN;
+  const durationSec = // keep only finite, positive durations; everything else is null
+    Number.isFinite(durationParsed) && durationParsed > 0
+      ? Math.max(1, Math.round(durationParsed)) // Math.max stops a sub-half-second value from rounding down to 0
+      : null;
+
+  const label = asString(normalized.mediaTitle);
+  return { kind, fileId, durationSec, label }; // fileId, durationSec and label are passed through even when kind is null
+}
+
+function fallbackTextFromMedia(media: TelegramInboundMedia) { // Returns bracketed placeholder text for a media message, or null when no media kind was recognized.
+  if (!media.kind) return null;
+  if (media.kind === "voice") return "[voice message]";
+  if (media.kind === "video_note") return "[video note]";
+  if (media.label) return `[audio] ${media.label}`; // only kind "audio" reaches this point; the label is ignored for voice and video_note
+  return "[audio]";
+}
+
 function parseOccurredAt(input: string | null | undefined) {
  const d = new Date(String(input ?? ""));
  if (Number.isNaN(d.getTime())) return new Date();
@@ -338,7 +378,11 @@ async function ingestInbound(env: OmniInboundEnvelopeV1) {
  }

  const businessConnectionId = String(n.businessConnectionId ?? "").trim() || null;
-  const text = normalizeText(n.text);
+  const media = parseTelegramInboundMedia(n);
+  const text = normalizeText(asString(n.text) ?? fallbackTextFromMedia(media));
+  const isAudioLike = Boolean(media.fileId) && (media.kind === "voice" || media.kind === "audio" || media.kind === "video_note");
+  const contactMessageKind: "MESSAGE" | "CALL" = isAudioLike ? "CALL" : "MESSAGE";
+  const contactMessageAudioUrl = isAudioLike ? `${TELEGRAM_AUDIO_FILE_MARKER}${media.fileId}` : null;
  const occurredAt = parseOccurredAt(env.occurredAt);
  const direction = safeDirection(env.direction);
  const contactProfile = buildContactProfile(n, externalContactId);
@@ -376,6 +420,10 @@ async function ingestInbound(env: OmniInboundEnvelopeV1) {
      threadExternalId: externalChatId,
      contactExternalId: externalContactId,
      businessConnectionId,
+      mediaKind: media.kind,
+      mediaFileId: media.fileId,
+      mediaDurationSec: media.durationSec,
+      mediaLabel: media.label,
    },
    payloadNormalized: n,
    payloadRaw: (env.payloadRaw ?? null) as Prisma.InputJsonValue,
@@ -431,10 +479,12 @@ async function ingestInbound(env: OmniInboundEnvelopeV1) {
    data: {
      contactId,
      contactInboxId: inbox.id,
-      kind: "MESSAGE",
+      kind: contactMessageKind,
      direction,
      channel: "TELEGRAM",
      content: text,
+      audioUrl: contactMessageAudioUrl,
+      durationSec: media.durationSec,
      occurredAt,
    },
  });