From d6c31f6f7a1881521107583ac80d9043fd43a846 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Tue, 17 Mar 2026 09:03:27 -0400 Subject: [PATCH] Fix mel spectrogram preprocessor allocating gigabytes of planned memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dynamic dimension max was computed as max_audio_len * n_samples (samples per 30s chunk), not max_audio_len * sampling_rate. With max_audio_len=300, this produced 144M samples (150 minutes) instead of 4.8M (5 minutes), causing a ~3.3 GB planned buffer for STFT intermediates. For streaming mode, the max was even worse: 600 * 480K = 288M samples, producing a 6.6 GB planned buffer — even though streaming processes ~1640 samples per step. Fix both paths: - Offline: use max_audio_len * sampling_rate (300s → 4.8M samples, ~110 MB) - Streaming: cap at 2 seconds (32K samples, ~0.7 MB) --- extension/audio/mel_spectrogram.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/extension/audio/mel_spectrogram.py b/extension/audio/mel_spectrogram.py index 50b9ded01af..4d7180854f1 100644 --- a/extension/audio/mel_spectrogram.py +++ b/extension/audio/mel_spectrogram.py @@ -192,10 +192,15 @@ def export_processor(model=None, output_file="whisper_preprocess.pte"): if model is None: model = WhisperAudioProcessor() - audio_tensor = torch.randn(93680) + if model.streaming: + # Streaming processes small windows per step. 2 seconds gives + # comfortable headroom while keeping the memory plan tight.
+ max_samples = 2 * model.sampling_rate + else: + max_samples = model.max_audio_len * model.sampling_rate + audio_tensor = torch.randn(min(93680, max_samples)) shapes_collection = torch.export.ShapesCollection() - max_n_chunks = int(model.max_audio_len * model.n_samples) - shapes_collection[audio_tensor] = {0: Dim.DYNAMIC(max=max_n_chunks)} + shapes_collection[audio_tensor] = {0: Dim.DYNAMIC(max=max_samples)} with torch.no_grad(), torch.fx.experimental._config.patch( backed_size_oblivious=True ):