feat(telegram): ingest and render inbound voice messages

This commit is contained in:
Ruslan Bakiev
2026-02-23 12:21:53 +07:00
parent c94c229a1a
commit acd974766a
4 changed files with 225 additions and 5 deletions

View File

@@ -28,6 +28,7 @@ type OmniInboundEnvelopeV1 = {
// Queue name for the receiver flow. Read from the RECEIVER_FLOW_QUEUE_NAME env var;
// an unset or empty value falls back to "receiver.flow" (because of `||`). The trim
// runs after the fallback, so a whitespace-only env value yields an empty string.
export const RECEIVER_FLOW_QUEUE_NAME = (process.env.RECEIVER_FLOW_QUEUE_NAME || "receiver.flow").trim();
// NOTE(review): presumably the prefix of placeholder texts produced for Telegram
// messages — its usage is not visible in this section; confirm against callers.
const TELEGRAM_PLACEHOLDER_PREFIX = "Telegram ";
// Marker prepended to a Telegram file id to build the `audioUrl` value stored for
// audio-like inbound messages (i.e. the value is "tg-file:<fileId>", not a real URL).
const TELEGRAM_AUDIO_FILE_MARKER = "tg-file:";
function redisConnectionFromEnv(): ConnectionOptions {
const raw = (process.env.REDIS_URL || "redis://localhost:6379").trim();
@@ -47,6 +48,45 @@ function normalizeText(input: unknown) {
return t || "[no text]";
}
/**
 * Media descriptor extracted from a normalized inbound Telegram payload.
 * Every field is `null` when the corresponding value is absent or unusable.
 */
type TelegramInboundMedia = {
// Recognized media kind; `null` when the payload's media kind is missing or not one of these three.
kind: "voice" | "audio" | "video_note" | null;
// File identifier taken from the payload's `mediaFileId`; `null` when not provided.
fileId: string | null;
// Duration in whole seconds (at least 1); `null` when missing, non-numeric, or not positive.
durationSec: number | null;
// Human-readable title taken from the payload's `mediaTitle`; `null` when not provided.
label: string | null;
};
/**
 * Builds a {@link TelegramInboundMedia} descriptor from a normalized inbound payload.
 *
 * - `kind` is the trimmed, lower-cased `mediaKind` when it is "voice", "audio" or
 *   "video_note"; otherwise `null`.
 * - `fileId` and `label` are `mediaFileId` / `mediaTitle` passed through `asString`.
 * - `durationSec` accepts a number or a numeric string, is rounded to whole seconds
 *   and clamped to a minimum of 1; anything non-finite or not positive becomes `null`.
 *
 * @param normalized The `payloadNormalized` part of the inbound envelope.
 * @returns The extracted media descriptor.
 */
function parseTelegramInboundMedia(normalized: OmniInboundEnvelopeV1["payloadNormalized"]): TelegramInboundMedia {
  const supportedKinds = ["voice", "audio", "video_note"] as const;
  const requestedKind = String(normalized.mediaKind ?? "").trim().toLowerCase();
  const kind: TelegramInboundMedia["kind"] =
    supportedKinds.find((candidate) => candidate === requestedKind) ?? null;

  const fileId = asString(normalized.mediaFileId);

  // Only numbers and strings are considered; every other type stays NaN and is rejected below.
  const rawDuration = normalized.mediaDurationSec;
  let seconds = Number.NaN;
  if (typeof rawDuration === "number") {
    seconds = rawDuration;
  } else if (typeof rawDuration === "string") {
    seconds = Number(rawDuration);
  }

  let durationSec: number | null = null;
  if (Number.isFinite(seconds) && seconds > 0) {
    // Sub-second positive durations would round to 0, so clamp them up to 1 second.
    durationSec = Math.max(1, Math.round(seconds));
  }

  const label = asString(normalized.mediaTitle);

  return { kind, fileId, durationSec, label };
}
/**
 * Produces placeholder message text for a media-only message.
 *
 * @param media Media descriptor of the inbound message.
 * @returns `null` when there is no media kind; "[voice message]" for voice,
 *   "[video note]" for video notes; otherwise "[audio]", followed by the label
 *   when one is present.
 */
function fallbackTextFromMedia(media: TelegramInboundMedia) {
  const { kind, label } = media;
  if (!kind) {
    return null;
  }

  switch (kind) {
    case "voice":
      return "[voice message]";
    case "video_note":
      return "[video note]";
    default:
      // Every remaining kind is treated as generic audio.
      return label ? `[audio] ${label}` : "[audio]";
  }
}
function parseOccurredAt(input: string | null | undefined) {
const d = new Date(String(input ?? ""));
if (Number.isNaN(d.getTime())) return new Date();
@@ -338,7 +378,11 @@ async function ingestInbound(env: OmniInboundEnvelopeV1) {
}
const businessConnectionId = String(n.businessConnectionId ?? "").trim() || null;
const text = normalizeText(n.text);
const media = parseTelegramInboundMedia(n);
const text = normalizeText(asString(n.text) ?? fallbackTextFromMedia(media));
const isAudioLike = Boolean(media.fileId) && (media.kind === "voice" || media.kind === "audio" || media.kind === "video_note");
const contactMessageKind: "MESSAGE" | "CALL" = isAudioLike ? "CALL" : "MESSAGE";
const contactMessageAudioUrl = isAudioLike ? `${TELEGRAM_AUDIO_FILE_MARKER}${media.fileId}` : null;
const occurredAt = parseOccurredAt(env.occurredAt);
const direction = safeDirection(env.direction);
const contactProfile = buildContactProfile(n, externalContactId);
@@ -376,6 +420,10 @@ async function ingestInbound(env: OmniInboundEnvelopeV1) {
threadExternalId: externalChatId,
contactExternalId: externalContactId,
businessConnectionId,
mediaKind: media.kind,
mediaFileId: media.fileId,
mediaDurationSec: media.durationSec,
mediaLabel: media.label,
},
payloadNormalized: n,
payloadRaw: (env.payloadRaw ?? null) as Prisma.InputJsonValue,
@@ -431,10 +479,12 @@ async function ingestInbound(env: OmniInboundEnvelopeV1) {
data: {
contactId,
contactInboxId: inbox.id,
kind: "MESSAGE",
kind: contactMessageKind,
direction,
channel: "TELEGRAM",
content: text,
audioUrl: contactMessageAudioUrl,
durationSec: media.durationSec,
occurredAt,
},
});