// clientsflow/frontend/server/utils/whisper.ts

/** Arguments accepted by `transcribeWithWhisper`. */
type WhisperTranscribeInput = {
  /** Audio samples passed as the first argument to the speech-recognition pipeline. */
  samples: Float32Array;
  /** Sample rate of `samples`; forwarded to the pipeline as its `sampling_rate` option. */
  sampleRate: number;
  /** Optional language override; when omitted, the default from `getWhisperLanguage()` is used. */
  language?: string;
};
// Cached promise for the speech-recognition pipeline; populated lazily by getWhisperPipeline().
let whisperPipelinePromise: Promise<any> | null = null;
// Cached promise for the dynamic import of "@xenova/transformers"; populated lazily by getWhisperPipeline().
let transformersPromise: Promise<any> | null = null;

/**
 * Resolves the model identifier used to build the speech-recognition pipeline.
 *
 * Reads `CF_WHISPER_MODEL` from the environment, trims it, and falls back to
 * the built-in default when the variable is unset or blank.
 */
function getWhisperModelId() {
  const fallbackModel = "Xenova/whisper-small";
  const configured = process.env.CF_WHISPER_MODEL;
  if (configured == null) {
    return fallbackModel;
  }

  const trimmed = configured.trim();
  return trimmed === "" ? fallbackModel : trimmed;
}

/**
 * Resolves the default transcription language.
 *
 * Reads `CF_WHISPER_LANGUAGE` from the environment, trims it, and falls back
 * to `"ru"` when the variable is unset or blank.
 */
function getWhisperLanguage() {
  const fallbackLanguage = "ru";
  const configured = process.env.CF_WHISPER_LANGUAGE?.trim();
  return configured ? configured : fallbackLanguage;
}

/**
 * Lazily loads `@xenova/transformers` and creates the shared
 * "automatic-speech-recognition" pipeline for the configured model.
 *
 * Both the module import and the pipeline are memoized in module-level
 * promises so concurrent callers share a single load. If either step rejects
 * (for example, a transient failure while fetching the model), the cached
 * promise is cleared so that the next call retries instead of replaying the
 * same rejection for the lifetime of the process.
 *
 * @returns The pipeline callable produced by the library's `pipeline()` factory.
 * @throws Whatever the dynamic import or `pipeline()` rejects with.
 */
async function getWhisperPipeline() {
  if (!transformersPromise) {
    const pendingImport: Promise<any> = import("@xenova/transformers");
    transformersPromise = pendingImport;
    // Drop a failed import from the cache; the identity check keeps a newer
    // attempt (started after this one failed) from being cleared by mistake.
    pendingImport.catch(() => {
      if (transformersPromise === pendingImport) {
        transformersPromise = null;
      }
    });
  }

  const { env, pipeline } = await transformersPromise;

  if (!whisperPipelinePromise) {
    // Library-wide settings, applied before the pipeline is created.
    env.allowRemoteModels = true;
    env.allowLocalModels = true;
    env.cacheDir = "/app/.data/transformers";

    const modelId = getWhisperModelId();
    const pendingPipeline: Promise<any> = pipeline("automatic-speech-recognition", modelId);
    whisperPipelinePromise = pendingPipeline;
    // Same retry-on-failure policy as the import above.
    pendingPipeline.catch(() => {
      if (whisperPipelinePromise === pendingPipeline) {
        whisperPipelinePromise = null;
      }
    });
  }

  return whisperPipelinePromise;
}

/**
 * Transcribes raw audio samples with the shared Whisper pipeline.
 *
 * @param input - Audio samples, their sample rate, and an optional language
 *   override. A missing, empty, or whitespace-only `language` falls back to
 *   the configured default from `getWhisperLanguage()`.
 * @returns The trimmed `text` field of the pipeline result, or an empty
 *   string when the result carries no text.
 * @throws Whatever pipeline loading or inference rejects with.
 */
export async function transcribeWithWhisper(input: WhisperTranscribeInput): Promise<string> {
  const transcriber = await getWhisperPipeline();

  // Blank overrides must not bypass the configured default language, and
  // surrounding whitespace must not reach the model as part of the code.
  const language = input.language?.trim() || getWhisperLanguage();

  // NOTE(review): confirm that the pipeline honors `sampling_rate` for raw
  // Float32Array input; if it does not, callers must supply samples already
  // at the rate the model expects.
  const result = await transcriber(input.samples, {
    sampling_rate: input.sampleRate,
    language,
    task: "transcribe",
    // Long audio is processed in 20 s windows with a 5 s stride.
    chunk_length_s: 20,
    stride_length_s: 5,
    return_timestamps: false,
  });

  return String(result?.text ?? "").trim();
}