From 99e89bcb2cf9b170f8e27e8934c7c4a66658b799 Mon Sep 17 00:00:00 2001 From: clflushopt <172141496+clflushopt@users.noreply.github.com> Date: Sat, 15 Feb 2025 16:41:29 -0500 Subject: [PATCH 1/4] feat(examples): Add an example of boundary analysis for AND/OR exprs The goal of this change is to add an example to explain data flow during boundary analysis of AND and OR expressions. --- datafusion-examples/examples/expr_api.rs | 83 +++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 2908edbb754d1..6b8d31fc3e634 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -22,8 +22,9 @@ use arrow::array::{BooleanArray, Int32Array, Int8Array}; use arrow::record_batch::RecordBatch; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; +use datafusion::common::stats::Precision; use datafusion::common::tree_node::{Transformed, TreeNode}; -use datafusion::common::DFSchema; +use datafusion::common::{ColumnStatistics, DFSchema}; use datafusion::common::{ScalarValue, ToDFSchema}; use datafusion::error::Result; use datafusion::functions_aggregate::first_last::first_value_udaf; @@ -80,6 +81,9 @@ async fn main() -> Result<()> { // See how to analyze ranges in expressions range_analysis_demo()?; + // See how to analyze boundaries in different kinds of expressions. + boundary_analysis_and_selectivity_demo()?; + // See how to determine the data types of expressions expression_type_demo()?; @@ -275,6 +279,83 @@ fn range_analysis_demo() -> Result<()> { Ok(()) } +// DataFusion expression boundary analysis framework allows it to infer +fn boundary_analysis_and_selectivity_demo() -> Result<()> { + // Case 1: Simple range predicate similar to id >= 5000 + let id_greater_5000 = col("id").gt_eq(lit(5000i64)); + + // Case 2: Compound predicate with AND (similar to a > 5 AND a < 7) + let id_between_5_7 = col("id").gt(lit(5i64)).and(col("id").lt(lit(7i64))); + + // Case 3: Compound predicate with OR (similar to a > 8 OR a = 1) + let id_gt8_or_eq1 = col("id").gt(lit(8i64)).or(col("id").eq(lit(1i64))); + + // Define our schema with an 'id' column + let schema = Arc::new(Schema::new(vec![make_field("id", DataType::Int64)])); + + // Create initial boundaries for our analysis + // We'll set known boundaries of id being between 1 and 10000 + let mut column_stats = ColumnStatistics::new_unknown(); + column_stats.min_value = Precision::Exact(ScalarValue::Int64(Some(1))); + column_stats.max_value = Precision::Exact(ScalarValue::Int64(Some(10000))); + + let initial_boundaries = + vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?]; + + // Create analysis context and schema for evaluation + let df_schema = DFSchema::try_from(schema.clone())?; + let ctx = SessionContext::new(); + + // Analyze Case 1: id >= 5000 + let physical_expr1 = ctx.create_physical_expr(id_greater_5000, &df_schema)?; + let analysis1 = analyze( + &physical_expr1, + AnalysisContext::new(initial_boundaries.clone()), + df_schema.as_ref(), + )?; + + // The analysis should show: + // - Minimum value: 5000 + // - Maximum value: 10000 + // - Selectivity estimate: ~0.5 as it selects half the range frm the statistics + // we previously specified. + println!("Analysis for id >= 5000:"); + println!( + "Boundaries: {:?} and Selectivity: {:?}", + analysis1.boundaries[0], analysis1.selectivity + ); + + // Analyze Case 2: id > 5 AND id < 7 + let physical_expr2 = ctx.create_physical_expr(id_between_5_7, &df_schema)?; + let analysis2 = analyze( + &physical_expr2, + AnalysisContext::new(initial_boundaries.clone()), + df_schema.as_ref(), + )?; + + // The analysis should show: + // - Minimum value: 6 + // - Maximum value: 6 + // - Very low selectivity as it's a point query + // Proper support for boundary analysis will return [false, true] as that's + // what the expression can potentially evaluate to. + println!("\nAnalysis for id > 5 AND id < 7:"); + println!("Boundaries: {:?}", analysis2.boundaries[0]); + + // Analyze Case 3: id > 8 OR id = 1 (Unsupported) + // + // Currently interval arithmetic is not implemented for the OR operator so + // the below call to analysis will return an error. + let physical_expr3 = ctx.create_physical_expr(id_gt8_or_eq1, &df_schema)?; + let _ = analyze( + &physical_expr3, + AnalysisContext::new(initial_boundaries.clone()), + df_schema.as_ref(), + )?; + + Ok(()) +} + fn make_field(name: &str, data_type: DataType) -> Field { let nullable = false; Field::new(name, data_type, nullable) From 9f22358c4a46d95f7df4e9dc499590f2c2d2fca0 Mon Sep 17 00:00:00 2001 From: clflushopt <172141496+clflushopt@users.noreply.github.com> Date: Sat, 15 Feb 2025 21:49:40 -0500 Subject: [PATCH 2/4] fix(examples): refine demo code for the example and cut the number of cases --- datafusion-examples/examples/expr_api.rs | 104 +++++++++++------------ 1 file changed, 49 insertions(+), 55 deletions(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 6b8d31fc3e634..366bc7fe25047 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -279,79 +279,73 @@ fn range_analysis_demo() -> Result<()> { Ok(()) } -// DataFusion expression boundary analysis framework allows it to infer +// DataFusion's analysis can infer boundary statistics and selectivity in +// various situations which can be helpful in building more efficient +// query plans. fn boundary_analysis_and_selectivity_demo() -> Result<()> { - // Case 1: Simple range predicate similar to id >= 5000 + // Consider the example where we want all rows with an `id` greater than + // 5000. let id_greater_5000 = col("id").gt_eq(lit(5000i64)); - // Case 2: Compound predicate with AND (similar to a > 5 AND a < 7) - let id_between_5_7 = col("id").gt(lit(5i64)).and(col("id").lt(lit(7i64))); - - // Case 3: Compound predicate with OR (similar to a > 8 OR a = 1) - let id_gt8_or_eq1 = col("id").gt(lit(8i64)).or(col("id").eq(lit(1i64))); - - // Define our schema with an 'id' column + // As in most examples we must tell DaataFusion the type of the column. let schema = Arc::new(Schema::new(vec![make_field("id", DataType::Int64)])); - // Create initial boundaries for our analysis - // We'll set known boundaries of id being between 1 and 10000 - let mut column_stats = ColumnStatistics::new_unknown(); - column_stats.min_value = Precision::Exact(ScalarValue::Int64(Some(1))); - column_stats.max_value = Precision::Exact(ScalarValue::Int64(Some(10000))); - + // DataFusion is able to do cardinality estimation on various column types + // these estimates represented by the `ColumnStatistics` type describe + // properties such as the maximum and minimum value, the number of distinct + // values and the number of null values. + let column_stats = ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(ScalarValue::Int64(Some(10000))), + min_value: Precision::Exact(ScalarValue::Int64(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }; + + // We can then build our expression boundaries from the column statistics + // allowing the analysis to be more precise. let initial_boundaries = vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?]; - // Create analysis context and schema for evaluation + // With the above we can perform the boundary analysis similar to the previous + // example. let df_schema = DFSchema::try_from(schema.clone())?; - let ctx = SessionContext::new(); - // Analyze Case 1: id >= 5000 - let physical_expr1 = ctx.create_physical_expr(id_greater_5000, &df_schema)?; - let analysis1 = analyze( + // Analysis case id >= 5000 + let physical_expr1 = + SessionContext::new().create_physical_expr(id_greater_5000, &df_schema)?; + let analysis = analyze( &physical_expr1, AnalysisContext::new(initial_boundaries.clone()), df_schema.as_ref(), )?; - // The analysis should show: - // - Minimum value: 5000 - // - Maximum value: 10000 - // - Selectivity estimate: ~0.5 as it selects half the range frm the statistics - // we previously specified. - println!("Analysis for id >= 5000:"); - println!( - "Boundaries: {:?} and Selectivity: {:?}", - analysis1.boundaries[0], analysis1.selectivity + // The analysis will return better bounds thanks to the column statistics. + // TODO: + assert_eq!( + analysis + .boundaries + .get(0) + .and_then(|boundary| Some(boundary.interval.clone().unwrap().into_bounds())), + Some(( + ScalarValue::Int64(Some(5000)), + ScalarValue::Int64(Some(10000)) + )) ); - // Analyze Case 2: id > 5 AND id < 7 - let physical_expr2 = ctx.create_physical_expr(id_between_5_7, &df_schema)?; - let analysis2 = analyze( - &physical_expr2, - AnalysisContext::new(initial_boundaries.clone()), - df_schema.as_ref(), - )?; - - // The analysis should show: - // - Minimum value: 6 - // - Maximum value: 6 - // - Very low selectivity as it's a point query - // Proper support for boundary analysis will return [false, true] as that's - // what the expression can potentially evaluate to. - println!("\nAnalysis for id > 5 AND id < 7:"); - println!("Boundaries: {:?}", analysis2.boundaries[0]); - - // Analyze Case 3: id > 8 OR id = 1 (Unsupported) + // We can also infer selectivity from the column statistics by assuming + // that the column is uniformly distributed and using the following + // estimation formula: + // Assuming the original range is [a, b] and the new range: [a', b'] // - // Currently interval arithmetic is not implemented for the OR operator so - // the below call to analysis will return an error. - let physical_expr3 = ctx.create_physical_expr(id_gt8_or_eq1, &df_schema)?; - let _ = analyze( - &physical_expr3, - AnalysisContext::new(initial_boundaries.clone()), - df_schema.as_ref(), - )?; + // (a' - b' + 1) / (a - b) + // (10000 - 5000 + 1) / (10000 - 1) + assert_eq!( + analysis + .selectivity + .is_some_and(|selectivity| selectivity >= 0.5 && selectivity <= 6.), + true + ); Ok(()) } From deca461367a088a363bafb38069c74d4d1c0f089 Mon Sep 17 00:00:00 2001 From: clflushopt <172141496+clflushopt@users.noreply.github.com> Date: Sun, 16 Feb 2025 11:08:06 -0500 Subject: [PATCH 3/4] fix(examples): remove left-over --- datafusion-examples/examples/expr_api.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 366bc7fe25047..90588c0ebc4e5 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -321,7 +321,6 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> { )?; // The analysis will return better bounds thanks to the column statistics. - // TODO: assert_eq!( analysis .boundaries From 21bd522793cb2c9345ae7bde1129dd3a37c7b136 Mon Sep 17 00:00:00 2001 From: clflushopt <172141496+clflushopt@users.noreply.github.com> Date: Sun, 16 Feb 2025 11:11:23 -0500 Subject: [PATCH 4/4] fix(examples): address linting issues --- datafusion-examples/examples/expr_api.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 90588c0ebc4e5..349850df6148b 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -322,10 +322,11 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> { // The analysis will return better bounds thanks to the column statistics. assert_eq!( - analysis - .boundaries - .get(0) - .and_then(|boundary| Some(boundary.interval.clone().unwrap().into_bounds())), + analysis.boundaries.first().map(|boundary| boundary + .interval + .clone() + .unwrap() + .into_bounds()), Some(( ScalarValue::Int64(Some(5000)), ScalarValue::Int64(Some(10000)) @@ -339,12 +340,9 @@ fn boundary_analysis_and_selectivity_demo() -> Result<()> { // // (a' - b' + 1) / (a - b) // (10000 - 5000 + 1) / (10000 - 1) - assert_eq!( - analysis - .selectivity - .is_some_and(|selectivity| selectivity >= 0.5 && selectivity <= 6.), - true - ); + assert!(analysis + .selectivity + .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity))); Ok(()) }