From d6c31f6f7a1881521107583ac80d9043fd43a846 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Tue, 17 Mar 2026 09:03:27 -0400 Subject: [PATCH] Fix mel spectrogram preprocessor allocating gigabytes of planned memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dynamic dimension max was computed as max_audio_len * n_samples (samples per 30s chunk), not max_audio_len * sampling_rate. With max_audio_len=300, this produced 144M samples (150 minutes) instead of 4.8M (5 minutes), causing a ~3.3 GB planned buffer for STFT intermediates. For streaming mode, the max was even worse: 600 * 480K = 288M samples, producing a 6.6 GB planned buffer — even though streaming processes ~1640 samples per step. Fix both paths: - Offline: use max_audio_len * sampling_rate (300s → 4.8M samples, ~110 MB) - Streaming: cap at 2 seconds (32K samples, ~0.7 MB) --- extension/audio/mel_spectrogram.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/extension/audio/mel_spectrogram.py b/extension/audio/mel_spectrogram.py index 50b9ded01af..4d7180854f1 100644 --- a/extension/audio/mel_spectrogram.py +++ b/extension/audio/mel_spectrogram.py @@ -192,10 +192,15 @@ def export_processor(model=None, output_file="whisper_preprocess.pte"): if model is None: model = WhisperAudioProcessor() - audio_tensor = torch.randn(93680) + if model.streaming: + # Streaming processes small windows per step. 2 seconds gives + # comfortable headroom while keeping the memory plan tight.
+ max_samples = 2 * model.sampling_rate + else: + max_samples = model.max_audio_len * model.sampling_rate + audio_tensor = torch.randn(min(93680, max_samples)) shapes_collection = torch.export.ShapesCollection() - max_n_chunks = int(model.max_audio_len * model.n_samples) - shapes_collection[audio_tensor] = {0: Dim.DYNAMIC(max=max_n_chunks)} + shapes_collection[audio_tensor] = {0: Dim.DYNAMIC(max=max_samples)} with torch.no_grad(), torch.fx.experimental._config.patch( backed_size_oblivious=True ):