54 lines
1.4 KiB
TypeScript
54 lines
1.4 KiB
TypeScript
/**
 * Arguments accepted by `transcribeWithWhisper`.
 */
type WhisperTranscribeInput = {
  /**
   * Raw audio samples, handed to the speech-recognition pipeline unchanged.
   * NOTE(review): presumably mono PCM in the [-1, 1] range — confirm with callers.
   */
  samples: Float32Array;
  /** Sample rate of `samples`; forwarded to the pipeline as `sampling_rate`. */
  sampleRate: number;
  /** Optional language hint; when omitted the configured default language is used. */
  language?: string;
};
|
|
/**
 * Module-level cache of the speech-recognition pipeline promise.
 * Stays `null` until `getWhisperPipeline` creates the pipeline on first use.
 */
let whisperPipelinePromise: Promise<any> | null = null;

/**
 * Module-level cache of the dynamically imported "@xenova/transformers" module,
 * so the import is only started once. Stays `null` until first use.
 */
let transformersPromise: Promise<any> | null = null;
|
|
|
|
/**
 * Resolves the Whisper model identifier to load.
 *
 * Reads `CF_WHISPER_MODEL` from the environment and trims it; when the variable
 * is unset, empty, or whitespace-only, falls back to "Xenova/whisper-small".
 *
 * @returns A non-empty model identifier.
 */
function getWhisperModelId(): string {
  const fallbackModelId = "Xenova/whisper-small";
  // `||` (not `??`) is intentional: an empty string after trimming must also fall back.
  return (process.env.CF_WHISPER_MODEL ?? fallbackModelId).trim() || fallbackModelId;
}
|
|
|
|
/**
 * Resolves the default transcription language.
 *
 * Reads `CF_WHISPER_LANGUAGE` from the environment and trims it; when the
 * variable is unset, empty, or whitespace-only, falls back to "ru".
 *
 * @returns A non-empty language value.
 */
function getWhisperLanguage(): string {
  const fallbackLanguage = "ru";
  const configured = (process.env.CF_WHISPER_LANGUAGE ?? fallbackLanguage).trim();
  // `||` (not `??`) is intentional: a blank value after trimming must also fall back.
  return configured || fallbackLanguage;
}
|
|
|
|
/**
 * Lazily creates and returns the shared automatic-speech-recognition pipeline.
 *
 * The "@xenova/transformers" module is imported dynamically on first use, and
 * both the import and the pipeline are cached in module-level promises so that
 * concurrent and subsequent callers share a single instance.
 *
 * If the import or the pipeline creation rejects, the corresponding cache entry
 * is cleared so that the next call retries instead of replaying the same
 * failure for the lifetime of the process.
 *
 * @returns The pipeline produced by `pipeline("automatic-speech-recognition", modelId)`.
 * @throws Whatever the dynamic import or the pipeline factory rejects with.
 */
async function getWhisperPipeline() {
  if (!transformersPromise) {
    const loading = import("@xenova/transformers");
    transformersPromise = loading;
    // Drop the cached promise on failure; the identity check avoids clobbering
    // a newer attempt that may have been started in the meantime.
    loading.catch(() => {
      if (transformersPromise === loading) {
        transformersPromise = null;
      }
    });
  }

  const { env, pipeline } = await transformersPromise;

  if (!whisperPipelinePromise) {
    // Library-wide settings are applied once, right before the pipeline is created.
    env.allowRemoteModels = true;
    env.allowLocalModels = true;
    env.cacheDir = "/app/.data/transformers";

    const modelId = getWhisperModelId();
    const creating: Promise<unknown> = pipeline("automatic-speech-recognition", modelId);
    whisperPipelinePromise = creating;
    // Same retry policy as above: a failed creation must not stay cached.
    creating.catch(() => {
      if (whisperPipelinePromise === creating) {
        whisperPipelinePromise = null;
      }
    });
  }

  return whisperPipelinePromise;
}
|
|
|
|
/**
 * Transcribes raw audio samples to text with the shared Whisper pipeline.
 *
 * The language is taken from `input.language` (trimmed); when it is missing or
 * blank, the configured default from `getWhisperLanguage()` is used. Audio is
 * processed in 20-second chunks with a 5-second stride and no timestamps.
 *
 * @param input Samples, their sample rate, and an optional language hint.
 * @returns The recognised text, trimmed; an empty string when the pipeline
 *   result carries no `text`.
 * @throws Whatever pipeline creation or the transcription call rejects with.
 */
export async function transcribeWithWhisper(input: WhisperTranscribeInput): Promise<string> {
  const transcriber = (await getWhisperPipeline()) as any;

  // A blank or whitespace-only hint falls back to the configured default, so the
  // per-call value is normalised the same way as the environment value.
  const language = input.language?.trim() || getWhisperLanguage();

  const result = await transcriber(input.samples, {
    // NOTE(review): confirm the pipeline honours `sampling_rate` for raw
    // Float32Array input; the library may expect samples already at the
    // model's rate.
    sampling_rate: input.sampleRate,
    language,
    task: "transcribe",
    chunk_length_s: 20,
    stride_length_s: 5,
    return_timestamps: false,
  });

  return String(result?.text ?? "").trim();
}
|