Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b9828ca
fix: optimize null predicate evaluation by early exit for non-restric…
kosiew Mar 16, 2026
88e4455
Add test case for a > b in join key predicates
kosiew Mar 16, 2026
df73001
Refactor column membership check into a helper function
kosiew Mar 16, 2026
144cab3
Clarify null-restricting behavior in filter check
kosiew Mar 16, 2026
9ef45f0
Simplify column check and null-restrict handling
kosiew Mar 16, 2026
3d3945c
refactor: streamline null predicate evaluation by introducing predica…
kosiew Mar 16, 2026
17009ae
Improve SQL boolean/null semantics handling
kosiew Mar 18, 2026
515da96
Add regression tests for SQL shape coverage
kosiew Mar 19, 2026
115b9f2
Refactor null predicate evaluation and add tests
kosiew Mar 19, 2026
38680f5
Refactor null handling in expression evaluation
kosiew Mar 19, 2026
ebd70ef
Test window scalar subquery optimizer delta
kosiew Mar 19, 2026
9665c63
Avoid join filter rewrite for scalar subqueries
kosiew Mar 19, 2026
00dcba4
new
kosiew Mar 19, 2026
12de382
Simplify null handling and filter push down logic
kosiew Mar 19, 2026
4fdeb7c
clippy fix
kosiew Mar 19, 2026
1d12172
Amend benchmark
kosiew Mar 20, 2026
2c5096d
Fix alias for scalar aggregate in push_down_filter
kosiew Mar 20, 2026
718cdf6
Refactor optimizer: remove test-only controls
kosiew Mar 20, 2026
deb6799
Add domain-specific evaluator for null restrictions
kosiew Mar 20, 2026
aa030ad
Improve null_restriction with documentation and evaluator
kosiew Mar 20, 2026
9cae403
Rename and clarify filter promotion function
kosiew Mar 20, 2026
11098c1
Refactor pushdown filter logic for clarity
kosiew Mar 20, 2026
84ca1b9
Add null restriction eval mode and test controls
kosiew Mar 20, 2026
fae294d
Refactor SQL planner for left join with filters
kosiew Mar 20, 2026
a7abb47
revert to main benchmark
kosiew Mar 20, 2026
69da463
Refactor null restriction logic in null_restriction.rs
kosiew Mar 20, 2026
72267e8
Make NullRestrictionEvalMode test-only
kosiew Mar 20, 2026
cb25006
Revert "revert to main benchmark"
kosiew Mar 20, 2026
0f15d15
amend benchmark
kosiew Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 132 additions & 126 deletions datafusion/core/benches/sql_planner_extended.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
use arrow::array::{ArrayRef, RecordBatch};
use arrow_schema::DataType;
use arrow_schema::TimeUnit::Nanosecond;
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use criterion::{
BenchmarkGroup, BenchmarkId, Criterion, criterion_group, criterion_main,
measurement::WallTime,
};
use datafusion::prelude::{DataFrame, SessionContext};
use datafusion_catalog::MemTable;
use datafusion_common::ScalarValue;
Expand All @@ -27,12 +30,17 @@ use datafusion_expr::{cast, col, lit, not, try_cast, when};
use datafusion_functions::expr_fn::{
btrim, length, regexp_like, regexp_replace, to_timestamp, upper,
};
use std::env;
use std::fmt::Write;
use std::hint::black_box;
use std::ops::Rem;
use std::sync::Arc;
use tokio::runtime::Runtime;

const FULL_PREDICATE_SWEEP: [usize; 5] = [10, 20, 30, 40, 60];
const FULL_DEPTH_SWEEP: [usize; 3] = [1, 2, 3];
const DEFAULT_SWEEP_POINTS: [(usize, usize); 3] = [(10, 1), (30, 2), (60, 3)];

// This benchmark suite is designed to test the performance of
// logical planning with a large plan containing unions, many columns
// with a variety of operations in it.
Expand Down Expand Up @@ -252,26 +260,6 @@ fn build_case_heavy_left_join_query(predicate_count: usize, case_depth: usize) -
query
}

/// Plans the CASE-heavy LEFT JOIN query and returns the resulting `DataFrame`.
///
/// When `push_down_filter_enabled` is false the `push_down_filter` optimizer
/// rule is removed from the session first, giving the control arm of the A/B
/// benchmark; the removal is asserted so a silently missing rule cannot skew
/// the comparison.
fn build_case_heavy_left_join_df_with_push_down_filter(
    rt: &Runtime,
    predicate_count: usize,
    case_depth: usize,
    push_down_filter_enabled: bool,
) -> DataFrame {
    let ctx = SessionContext::new();
    register_string_table(&ctx, 100, 1000);

    // Control arm: strip the rule and verify it was actually present.
    if !push_down_filter_enabled {
        assert!(
            ctx.remove_optimizer_rule("push_down_filter"),
            "push_down_filter rule should be present in the default optimizer"
        );
    }

    let sql = build_case_heavy_left_join_query(predicate_count, case_depth);
    rt.block_on(async { ctx.sql(&sql).await.unwrap() })
}

fn build_non_case_left_join_query(
predicate_count: usize,
nesting_depth: usize,
Expand Down Expand Up @@ -304,10 +292,11 @@ fn build_non_case_left_join_query(
query
}

fn build_non_case_left_join_df_with_push_down_filter(
fn build_left_join_df_with_push_down_filter(
rt: &Runtime,
query_builder: impl Fn(usize, usize) -> String,
predicate_count: usize,
nesting_depth: usize,
depth: usize,
push_down_filter_enabled: bool,
) -> DataFrame {
let ctx = SessionContext::new();
Expand All @@ -320,10 +309,103 @@ fn build_non_case_left_join_df_with_push_down_filter(
);
}

let query = build_non_case_left_join_query(predicate_count, nesting_depth);
let query = query_builder(predicate_count, depth);
rt.block_on(async { ctx.sql(&query).await.unwrap() })
}

/// Builds the CASE-heavy LEFT JOIN `DataFrame` for the push-down-filter A/B
/// benchmark by delegating to the shared builder with the CASE-heavy query
/// generator. `push_down_filter_enabled` selects whether the optimizer rule
/// is kept (treatment arm) or removed (control arm).
fn build_case_heavy_left_join_df_with_push_down_filter(
    rt: &Runtime,
    predicate_count: usize,
    case_depth: usize,
    push_down_filter_enabled: bool,
) -> DataFrame {
    build_left_join_df_with_push_down_filter(
        rt,
        build_case_heavy_left_join_query,
        predicate_count,
        case_depth,
        push_down_filter_enabled,
    )
}

/// Builds the non-CASE LEFT JOIN `DataFrame` (the control query shape) for
/// the push-down-filter A/B benchmark by delegating to the shared builder
/// with the non-CASE query generator. `push_down_filter_enabled` selects
/// whether the optimizer rule is kept or removed.
fn build_non_case_left_join_df_with_push_down_filter(
    rt: &Runtime,
    predicate_count: usize,
    nesting_depth: usize,
    push_down_filter_enabled: bool,
) -> DataFrame {
    build_left_join_df_with_push_down_filter(
        rt,
        build_non_case_left_join_query,
        predicate_count,
        nesting_depth,
        push_down_filter_enabled,
    )
}

/// Returns true when the `DATAFUSION_PUSH_DOWN_FILTER_FULL_SWEEP` environment
/// variable is set to `"1"` or (case-insensitively) `"true"`.
///
/// Unset or any other value selects the small default sweep, keeping routine
/// benchmark runs fast while still allowing an opt-in exhaustive sweep.
fn include_full_push_down_filter_sweep() -> bool {
    env::var("DATAFUSION_PUSH_DOWN_FILTER_FULL_SWEEP")
        .is_ok_and(|value| value == "1" || value.eq_ignore_ascii_case("true"))
}

/// Returns the `(predicate_count, depth)` sweep points for the A/B benchmark.
///
/// By default a small representative sample (`DEFAULT_SWEEP_POINTS`) is used;
/// when the full-sweep environment flag is set, the full cross product of
/// `FULL_PREDICATE_SWEEP` x `FULL_DEPTH_SWEEP` is generated instead.
fn push_down_filter_sweep_points() -> Vec<(usize, usize)> {
    if !include_full_push_down_filter_sweep() {
        return DEFAULT_SWEEP_POINTS.to_vec();
    }

    // Full cross product, grouped by depth to match the original ordering.
    let mut points =
        Vec::with_capacity(FULL_DEPTH_SWEEP.len() * FULL_PREDICATE_SWEEP.len());
    for depth in FULL_DEPTH_SWEEP {
        for predicate_count in FULL_PREDICATE_SWEEP {
            points.push((predicate_count, depth));
        }
    }
    points
}

/// Registers the with/without `push_down_filter` A/B benchmarks for every
/// sweep point.
///
/// For each `(predicate_count, depth)` pair two benchmark IDs are emitted at
/// the same input label so the rule's planning cost can be isolated by
/// comparing them directly:
/// - `with_push_down_filter`: default optimizer path (rule enabled)
/// - `without_push_down_filter`: control path with the rule removed
///
/// `build_df` receives `(rt, predicate_count, depth, push_down_filter_enabled)`
/// and must return the planned-but-unoptimized `DataFrame`; each iteration
/// clones it and measures `into_optimized_plan`.
fn bench_push_down_filter_ab<BuildFn>(
    group: &mut BenchmarkGroup<'_, WallTime>,
    rt: &Runtime,
    sweep_points: &[(usize, usize)],
    build_df: BuildFn,
) where
    BuildFn: Fn(&Runtime, usize, usize, bool) -> DataFrame,
{
    for &(predicate_count, depth) in sweep_points {
        let input_label = format!("predicates={predicate_count},nesting_depth={depth}");

        // Single loop over the enabled flag replaces the previously duplicated
        // with/without registration blocks.
        for push_down_filter_enabled in [true, false] {
            let id = if push_down_filter_enabled {
                "with_push_down_filter"
            } else {
                "without_push_down_filter"
            };
            let df = build_df(rt, predicate_count, depth, push_down_filter_enabled);

            group.bench_with_input(BenchmarkId::new(id, &input_label), &df, |b, df| {
                b.iter(|| {
                    // Clone per iteration: into_optimized_plan consumes the DataFrame.
                    let df_clone = df.clone();
                    black_box(
                        rt.block_on(async { df_clone.into_optimized_plan().unwrap() }),
                    );
                })
            });
        }
    }
}

fn criterion_benchmark(c: &mut Criterion) {
let baseline_ctx = SessionContext::new();
let case_heavy_ctx = SessionContext::new();
Expand All @@ -349,116 +431,40 @@ fn criterion_benchmark(c: &mut Criterion) {
})
});

let predicate_sweep = [10, 20, 30, 40, 60];
let case_depth_sweep = [1, 2, 3];
let sweep_points = push_down_filter_sweep_points();

let mut hotspot_group =
c.benchmark_group("push_down_filter_hotspot_case_heavy_left_join_ab");
for case_depth in case_depth_sweep {
for predicate_count in predicate_sweep {
let with_push_down_filter =
build_case_heavy_left_join_df_with_push_down_filter(
&rt,
predicate_count,
case_depth,
true,
);
let without_push_down_filter =
build_case_heavy_left_join_df_with_push_down_filter(
&rt,
predicate_count,
case_depth,
false,
);

let input_label =
format!("predicates={predicate_count},case_depth={case_depth}");
// A/B interpretation:
// - with_push_down_filter: default optimizer path (rule enabled)
// - without_push_down_filter: control path with the rule removed
// Compare both IDs at the same sweep point to isolate rule impact.
hotspot_group.bench_with_input(
BenchmarkId::new("with_push_down_filter", &input_label),
&with_push_down_filter,
|b, df| {
b.iter(|| {
let df_clone = df.clone();
black_box(
rt.block_on(async {
df_clone.into_optimized_plan().unwrap()
}),
);
})
},
);
hotspot_group.bench_with_input(
BenchmarkId::new("without_push_down_filter", &input_label),
&without_push_down_filter,
|b, df| {
b.iter(|| {
let df_clone = df.clone();
black_box(
rt.block_on(async {
df_clone.into_optimized_plan().unwrap()
}),
);
})
},
);
}
}
bench_push_down_filter_ab(
&mut hotspot_group,
&rt,
&sweep_points,
|rt, predicate_count, depth, enable| {
build_case_heavy_left_join_df_with_push_down_filter(
rt,
predicate_count,
depth,
enable,
)
},
);
hotspot_group.finish();

let mut control_group =
c.benchmark_group("push_down_filter_control_non_case_left_join_ab");
for nesting_depth in case_depth_sweep {
for predicate_count in predicate_sweep {
let with_push_down_filter = build_non_case_left_join_df_with_push_down_filter(
&rt,
bench_push_down_filter_ab(
&mut control_group,
&rt,
&sweep_points,
|rt, predicate_count, depth, enable| {
build_non_case_left_join_df_with_push_down_filter(
rt,
predicate_count,
nesting_depth,
true,
);
let without_push_down_filter =
build_non_case_left_join_df_with_push_down_filter(
&rt,
predicate_count,
nesting_depth,
false,
);

let input_label =
format!("predicates={predicate_count},nesting_depth={nesting_depth}");
control_group.bench_with_input(
BenchmarkId::new("with_push_down_filter", &input_label),
&with_push_down_filter,
|b, df| {
b.iter(|| {
let df_clone = df.clone();
black_box(
rt.block_on(async {
df_clone.into_optimized_plan().unwrap()
}),
);
})
},
);
control_group.bench_with_input(
BenchmarkId::new("without_push_down_filter", &input_label),
&without_push_down_filter,
|b, df| {
b.iter(|| {
let df_clone = df.clone();
black_box(
rt.block_on(async {
df_clone.into_optimized_plan().unwrap()
}),
);
})
},
);
}
}
depth,
enable,
)
},
);
control_group.finish();
}

Expand Down
1 change: 1 addition & 0 deletions datafusion/core/tests/sql/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ pub mod create_drop;
pub mod explain_analyze;
pub mod joins;
mod path_partition;
mod push_down_filter_regressions;
mod runtime_config;
pub mod select;
mod sql_api;
Expand Down
Loading
Loading