diff --git a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs index 75cc5d9514cbd..e8a9358b71082 100644 --- a/datafusion/functions/src/regex/mod.rs +++ b/datafusion/functions/src/regex/mod.rs @@ -23,6 +23,7 @@ use std::collections::HashMap; use std::collections::hash_map::Entry; use std::sync::Arc; pub mod regexpcount; +pub mod regexpextract; pub mod regexpinstr; pub mod regexplike; pub mod regexpmatch; @@ -30,6 +31,7 @@ pub mod regexpreplace; // create UDFs make_udf_function!(regexpcount::RegexpCountFunc, regexp_count); +make_udf_function!(regexpextract::RegexpExtractFunc, regexp_extract); make_udf_function!(regexpinstr::RegexpInstrFunc, regexp_instr); make_udf_function!(regexpmatch::RegexpMatchFunc, regexp_match); make_udf_function!(regexplike::RegexpLikeFunc, regexp_like); @@ -65,6 +67,19 @@ pub mod expr_fn { super::regexp_match().call(args) } + /// Extracts a group that matches `regexp`. If `idx` is not specified, + /// it defaults to 1. + /// + /// Matches Spark's DataFrame API: `regexp_extract(e: Column, exp: String, groupIdx: Int)` + /// and the SQL syntax: `regexp_extract(str, regexp[, idx])` + pub fn regexp_extract(values: Expr, regex: Expr, idx: Option) -> Expr { + let mut args = vec![values, regex]; + if let Some(idx) = idx { + args.push(idx); + } + super::regexp_extract().call(args) + } + /// Returns index of regular expression matches in a string. pub fn regexp_instr( values: Expr, @@ -125,6 +140,7 @@ pub fn functions() -> Vec> { regexp_instr(), regexp_like(), regexp_replace(), + regexp_extract(), ] } diff --git a/datafusion/functions/src/regex/regexpextract.rs b/datafusion/functions/src/regex/regexpextract.rs new file mode 100644 index 0000000000000..25757d89ce297 --- /dev/null +++ b/datafusion/functions/src/regex/regexpextract.rs @@ -0,0 +1,551 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Regex expressions +use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StringBuilder}; +use arrow::datatypes::DataType; +use arrow::error::ArrowError; +use datafusion_common::exec_err; +use datafusion_common::{DataFusionError, Result}; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature}; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_macros::user_doc; +use regex::Regex; +use std::any::Any; +use std::sync::Arc; + +// See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html +// See https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala#L863 + +#[user_doc( + doc_section(label = "Regular Expression Functions"), + description = "Extract the first string in the `str` that match the `regexp` expression and corresponding to the regex group index", + syntax_example = "regexp_extract(str, regexp[, idx])", + sql_example = r#"```sql + > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1); + +---------------------------------------------------------+ + | 100 | + +---------------------------------------------------------+ + > SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 2); + 
+---------------------------------------------------------+ + | 200 | + +---------------------------------------------------------+ +``` +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs) +"#, + argument(name = "str", description = "Column or column name"), + argument( + name = "regexp", + description = r#"a string representing a regular expression. The regex string should be a + Java regular expression.

+ Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL + parser, see the unescaping rules at String Literal. + For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".

+ There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to + fallback to the Spark 1.6 behavior regarding string literal parsing. For example, + if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".

+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping + special characters in the pattern string if exists."# + ), + argument( + name = "idx", + description = r#"an integer expression that representing the group index. The regex maybe contains + multiple groups. `idx` indicates which regex group to extract. The group index should + be non-negative. The minimum value of `idx` is 0, which means matching the entire + regular expression. If `idx` is not specified, the default group index value is 1. + This parameter is optional; when omitted the function defaults to extracting the first + capture group (idx=1), matching Spark's behavior."# + ) +)] +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct RegexpExtractFunc { + signature: Signature, +} + +impl Default for RegexpExtractFunc { + fn default() -> Self { + Self::new() + } +} + +impl RegexpExtractFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + // Spark Catalyst Expression: RegExpExtract(subject, regexp, idx) + // where idx defaults to 1 when omitted. 
+ // See: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala + // + // 2-arg form: regexp_extract(str, regexp) — idx defaults to 1 + // Matches Spark's: def this(s: Expression, r: Expression) = this(s, r, Literal(1)) + TypeSignature::Exact(vec![Utf8, Utf8]), + // 3-arg form: regexp_extract(str, regexp, idx) + TypeSignature::Exact(vec![Utf8, Utf8, Int32]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RegexpExtractFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "regexp_extract" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + // Spark's RegExpExtract always returns StringType + match arg_types.len() { + 2 | 3 => match arg_types[0] { + Utf8 => Ok(Utf8), + _ => exec_err!("regexp_extract only supports Utf8 for arg0"), + }, + _ => exec_err!( + "regexp_extract expects 2 or 3 arguments, got {}", + arg_types.len() + ), + } + } + + fn invoke_with_args( + &self, + args: datafusion_expr::ScalarFunctionArgs, + ) -> Result { + let args = &args.args; + + if args.len() != 2 && args.len() != 3 { + return exec_err!("regexp_extract expects 2 or 3 arguments"); + } + + // DataFusion passes either scalars or arrays. Convert to arrays. + let len = args + .iter() + .map(|v| match v { + ColumnarValue::Array(a) => a.len(), + ColumnarValue::Scalar(_) => 1, + }) + .max() + .unwrap_or(1); + + let a0 = args[0].to_array(len)?; + let a1 = args[1].to_array(len)?; + + // Spark Catalyst: def this(s, r) = this(s, r, Literal(1)) + // When idx is omitted, default to group index 1. + let a2 = if args.len() == 3 { + args[2].to_array(len)? 
+ } else { + // Default idx = 1, matching Spark's behavior + Arc::new(Int32Array::from(vec![1; len])) as ArrayRef + }; + + let out: ArrayRef = regexp_extract(&[a0, a1, a2])?; + Ok(ColumnarValue::Array(out)) + } + + fn documentation(&self) -> Option<&Documentation> { + self.doc() + } +} + +/// Helper to build args for tests and external callers. +pub fn regexp_extract(args: &[ArrayRef]) -> Result { + if args.len() != 3 { + return exec_err!("regexp_extract expects 3 arguments"); + } + + match args[0].data_type() { + // TODO: DataType::Utf8View => regexp_extract_utf8_view(args), + DataType::Utf8 => regexp_extract_utf8(args), + // TODO: DataType::LargeUtf8 => regexp_extract_large_utf8(args), + other => exec_err!("regexp_extract unsupported input type {other:?}"), + } +} + +fn regexp_extract_utf8(args: &[ArrayRef]) -> Result { + let values = &args[0]; + let pattern = &args[1]; + let index = &args[2]; + + let values_array = values + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Execution("arg0 must be Utf8".to_string()))?; + let pattern_array = pattern + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Execution("arg1 must be Utf8".to_string()))?; + let index_array = index + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Execution("arg2 must be Int32".to_string()))?; + + let mut out = StringBuilder::new(); + + // Now iterate over the values, pattern, and index arrays and extract the matches + + for i in 0..values_array.len() { + if values_array.is_null(i) || pattern_array.is_null(i) || index_array.is_null(i) { + out.append_null(); + continue; + } + + let value = values_array.value(i); + let pattern = pattern_array.value(i); + let index = index_array.value(i); + + if index < 0 { + return exec_err!( + "The value of idx in regexp_extract must be non-negative, but got {}", + index + ); + } + + let group_index = index as usize; + + let regex = + Regex::new(pattern).map_err(|e| ArrowError::ComputeError(e.to_string()))?; + + // 
Validate group index doesn't exceed the number of capture groups + let num_groups = regex.captures_len(); + if group_index >= num_groups { + return exec_err!( + "Regex group index {} exceeds the number of groups {} in pattern '{}'", + group_index, + num_groups - 1, + pattern + ); + } + + if let Some(cap) = regex.captures(value) { + // cap.get() returns None for unmatched optional groups like (b)? + // Spark returns empty string in that case + match cap.get(group_index) { + Some(m) => out.append_value(m.as_str()), + None => out.append_value(""), + } + } else { + // No match at all — Spark returns empty string, not null + out.append_value(""); + } + } + + Ok(Arc::new(out.finish())) +} + +#[cfg(test)] +mod tests { + use super::regexp_extract; + use arrow::array::{Array, ArrayRef, Int32Array, StringArray}; + use std::sync::Arc; + + /// Helper: call regexp_extract with single-element arrays and return the result string. + fn extract_one(value: &str, pattern: &str, idx: i32) -> Option { + let values = Arc::new(StringArray::from(vec![value])); + let patterns = Arc::new(StringArray::from(vec![pattern])); + let indices = Arc::new(Int32Array::from(vec![idx])); + let result = regexp_extract(&[values, patterns, indices]).unwrap(); + let arr = result + .as_any() + .downcast_ref::() + .expect("expected StringArray"); + if arr.is_null(0) { + None + } else { + Some(arr.value(0).to_string()) + } + } + + // ----------------------------------------------------------------------- + // Tests derived from the PySpark regexp_extract documentation: + // https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html + // ----------------------------------------------------------------------- + + /// PySpark example 1: + /// regexp_extract('100-200', r'(\d+)-(\d+)', 1) → '100' + #[test] + fn test_spark_example_extract_first_group() { + assert_eq!( + extract_one("100-200", r"(\d+)-(\d+)", 1), + Some("100".to_string()) + ); + } + + /// Also 
verify extracting the second capture group: + /// regexp_extract('100-200', r'(\d+)-(\d+)', 2) → '200' + #[test] + fn test_spark_example_extract_second_group() { + assert_eq!( + extract_one("100-200", r"(\d+)-(\d+)", 2), + Some("200".to_string()) + ); + } + + /// PySpark example 2 — no match returns empty string: + /// regexp_extract('foo', r'(\d+)', 1) → '' + #[test] + fn test_spark_example_no_match_returns_empty() { + assert_eq!(extract_one("foo", r"(\d+)", 1), Some("".to_string())); + } + + /// PySpark example 3 — unmatched optional group returns empty string: + /// regexp_extract('aaaac', '(a+)(b)?(c)', 2) → '' + #[test] + fn test_spark_example_optional_group_not_matched() { + assert_eq!( + extract_one("aaaac", r"(a+)(b)?(c)", 2), + Some("".to_string()) + ); + } + + // ----------------------------------------------------------------------- + // Additional coverage + // ----------------------------------------------------------------------- + + /// idx=0 returns the entire match. + #[test] + fn test_idx_zero_returns_entire_match() { + assert_eq!( + extract_one("100-200", r"(\d+)-(\d+)", 0), + Some("100-200".to_string()) + ); + } + + /// Verify extracting groups from partially-matching optional groups. + /// regexp_extract('aaaac', '(a+)(b)?(c)', 1) → 'aaaa' + #[test] + fn test_optional_group_extract_first() { + assert_eq!( + extract_one("aaaac", r"(a+)(b)?(c)", 1), + Some("aaaa".to_string()) + ); + } + + /// regexp_extract('aaaac', '(a+)(b)?(c)', 3) → 'c' + #[test] + fn test_optional_group_extract_third() { + assert_eq!( + extract_one("aaaac", r"(a+)(b)?(c)", 3), + Some("c".to_string()) + ); + } + + /// Negative group index should error. 
+ #[test] + fn test_negative_index_errors() { + let values = Arc::new(StringArray::from(vec!["abc"])); + let patterns = Arc::new(StringArray::from(vec!["(a)"])); + let indices = Arc::new(Int32Array::from(vec![-1])); + let err = regexp_extract(&[values, patterns, indices]) + .expect_err("negative index should error"); + let msg = err.to_string(); + assert!( + msg.contains("non-negative"), + "unexpected error message: {msg}" + ); + } + + /// Group index exceeding the number of groups should error. + #[test] + fn test_group_index_out_of_bounds() { + let values = Arc::new(StringArray::from(vec!["abc"])); + let patterns = Arc::new(StringArray::from(vec!["(a)"])); + // pattern has 1 capture group, so valid indices are 0 and 1; 2 is out of bounds + let indices = Arc::new(Int32Array::from(vec![2])); + let err = regexp_extract(&[values, patterns, indices]) + .expect_err("out-of-bounds index should error"); + let msg = err.to_string(); + assert!(msg.contains("exceeds"), "unexpected error message: {msg}"); + } + + /// Multiple rows processed in a single batch. + #[test] + fn test_batch_multiple_rows() { + let values = Arc::new(StringArray::from(vec![ + "100-200", + "foo", + "aaaac", + "hello-world", + ])); + let patterns = Arc::new(StringArray::from(vec![ + r"(\d+)-(\d+)", + r"(\d+)", + r"(a+)(b)?(c)", + r"(\w+)-(\w+)", + ])); + let indices = Arc::new(Int32Array::from(vec![1, 1, 2, 2])); + + let result = regexp_extract(&[values, patterns, indices]).unwrap(); + let arr = result + .as_any() + .downcast_ref::() + .expect("expected StringArray"); + + assert_eq!(arr.value(0), "100"); // first group of '100-200' + assert_eq!(arr.value(1), ""); // no match in 'foo' + assert_eq!(arr.value(2), ""); // optional (b)? not matched + assert_eq!(arr.value(3), "world"); // second group of 'hello-world' + } + + /// Null input produces null output. 
+ #[test] + fn test_null_input_produces_null() { + let values = Arc::new(StringArray::from(vec![None as Option<&str>])); + let patterns = Arc::new(StringArray::from(vec![Some(r"(\d+)")])); + let indices = Arc::new(Int32Array::from(vec![1])); + + let result = regexp_extract(&[values, patterns, indices]).unwrap(); + let arr = result + .as_any() + .downcast_ref::() + .expect("expected StringArray"); + + assert!(arr.is_null(0)); + } + + /// Empty string input matches empty pattern group 0. + #[test] + fn test_empty_string_with_matching_pattern() { + // Pattern .* matches empty string; group 0 is the entire match "" + assert_eq!(extract_one("", ".*", 0), Some("".to_string())); + } + + // ----------------------------------------------------------------------- + // Tests for 2-arg form (idx defaults to 1, matching Spark's Catalyst + // `def this(s, r) = this(s, r, Literal(1))`) + // ----------------------------------------------------------------------- + + /// Helper: call regexp_extract via invoke_with_args with 2 args (no idx), + /// verifying that idx defaults to 1. 
+ fn extract_two_arg(value: &str, pattern: &str) -> Option { + use super::RegexpExtractFunc; + use arrow::datatypes::Field; + use datafusion_common::config::ConfigOptions; + use datafusion_expr::ColumnarValue; + use datafusion_expr::ScalarUDFImpl; + + let func = RegexpExtractFunc::new(); + let values = + ColumnarValue::Array(Arc::new(StringArray::from(vec![value])) as ArrayRef); + let patterns = + ColumnarValue::Array(Arc::new(StringArray::from(vec![pattern])) as ArrayRef); + + let result = func + .invoke_with_args(datafusion_expr::ScalarFunctionArgs { + args: vec![values, patterns], + arg_fields: vec![ + Arc::new(Field::new("str", arrow::datatypes::DataType::Utf8, true)), + Arc::new(Field::new( + "regexp", + arrow::datatypes::DataType::Utf8, + true, + )), + ], + number_rows: 1, + return_field: Arc::new(Field::new( + "result", + arrow::datatypes::DataType::Utf8, + true, + )), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap(); + + match result { + ColumnarValue::Array(arr) => { + let arr = arr + .as_any() + .downcast_ref::() + .expect("expected StringArray"); + if arr.is_null(0) { + None + } else { + Some(arr.value(0).to_string()) + } + } + _ => panic!("expected Array"), + } + } + + /// 2-arg form: regexp_extract('100-200', r'(\d+)-(\d+)') → '100' + /// (idx defaults to 1, extracting first capture group) + #[test] + fn test_two_arg_defaults_to_group_1() { + assert_eq!( + extract_two_arg("100-200", r"(\d+)-(\d+)"), + Some("100".to_string()) + ); + } + + /// 2-arg form with no match returns empty string (same as Spark). + #[test] + fn test_two_arg_no_match_returns_empty() { + assert_eq!(extract_two_arg("foo", r"(\d+)"), Some("".to_string())); + } + + /// 2-arg form extracts first group from a single-group pattern. 
+ #[test] + fn test_two_arg_single_group() { + assert_eq!( + extract_two_arg("hello world", r"(\w+)"), + Some("hello".to_string()) + ); + } + + /// Multiple rows with the same pattern and group index — the common case + /// where a single regex is applied across an entire column. + #[test] + fn test_batch_same_pattern() { + let values = Arc::new(StringArray::from(vec![ + "2024-01-15", + "2023-12-25", + "no-date-here", + "2025-06-30", + "", + ])); + // Same pattern repeated for every row + let patterns = Arc::new(StringArray::from(vec![ + r"(\d{4})-(\d{2})-(\d{2})"; + 5 + ])); + // Same group index (1 = year) for every row + let indices = Arc::new(Int32Array::from(vec![1; 5])); + + let result = regexp_extract(&[values, patterns, indices]).unwrap(); + let arr = result + .as_any() + .downcast_ref::() + .expect("expected StringArray"); + + assert_eq!(arr.value(0), "2024"); + assert_eq!(arr.value(1), "2023"); + assert_eq!(arr.value(2), ""); // no match → empty string + assert_eq!(arr.value(3), "2025"); + assert_eq!(arr.value(4), ""); // empty input → no match → empty string + } +} diff --git a/datafusion/sqllogictest/test_files/regexp/README.md b/datafusion/sqllogictest/test_files/regexp/README.md index 7e5efc5b5ddf2..4440661e72177 100644 --- a/datafusion/sqllogictest/test_files/regexp/README.md +++ b/datafusion/sqllogictest/test_files/regexp/README.md @@ -30,6 +30,7 @@ regexp/ - regexp_count.slt // Tests for regexp_count function - regexp_match.slt // Tests for regexp_match function - regexp_replace.slt // Tests for regexp_replace function + - regexp_extract.slt // Tests for regexp_extract function ``` ## Tested Functions @@ -38,6 +39,8 @@ regexp/ 2. `regexp_count`: Count occurrences of a pattern in a string 3. `regexp_match`: Extract matching substrings 4. `regexp_replace`: Replace matched substrings +5. `regexp_extract`: Extract a specific capture group matched by a regular expression from the specified string.
+ ## Test Data diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_extract.slt b/datafusion/sqllogictest/test_files/regexp/regexp_extract.slt new file mode 100644 index 0000000000000..7dfb25752c48e --- /dev/null +++ b/datafusion/sqllogictest/test_files/regexp/regexp_extract.slt @@ -0,0 +1,207 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Import common test data +include ./init_data.slt.part + +# ----------------------------------------------------------------------- +# Tests derived from the PySpark regexp_extract documentation: +# https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html +# ----------------------------------------------------------------------- + +# PySpark example 1: extract first capture group +# regexp_extract('100-200', r'(\d+)-(\d+)', 1) → '100' +query T +SELECT regexp_extract('100-200', '(\d+)-(\d+)', CAST(1 AS INT)); +---- +100 + +# Extract second capture group +# regexp_extract('100-200', r'(\d+)-(\d+)', 2) → '200' +query T +SELECT regexp_extract('100-200', '(\d+)-(\d+)', CAST(2 AS INT)); +---- +200 + +# PySpark example 2: no match returns empty string +# regexp_extract('foo', r'(\d+)', 1) → '' +query T +SELECT regexp_extract('foo', '(\d+)', CAST(1 AS INT)); +---- +(empty) + +# PySpark example 3: unmatched optional group returns empty string +# regexp_extract('aaaac', '(a+)(b)?(c)', 2) → '' +query T +SELECT regexp_extract('aaaac', '(a+)(b)?(c)', CAST(2 AS INT)); +---- +(empty) + +# ----------------------------------------------------------------------- +# Additional coverage +# ----------------------------------------------------------------------- + +# idx=0 returns the entire match +query T +SELECT regexp_extract('100-200', '(\d+)-(\d+)', CAST(0 AS INT)); +---- +100-200 + +# Extract first group from partially-matching optional groups +# regexp_extract('aaaac', '(a+)(b)?(c)', 1) → 'aaaa' +query T +SELECT regexp_extract('aaaac', '(a+)(b)?(c)', CAST(1 AS INT)); +---- +aaaa + +# Extract third group from partially-matching optional groups +# regexp_extract('aaaac', '(a+)(b)?(c)', 3) → 'c' +query T +SELECT regexp_extract('aaaac', '(a+)(b)?(c)', CAST(3 AS INT)); +---- +c + +# Negative group index should error +query error The value of idx in regexp_extract must be non\-negative, but got \-1 +SELECT 
regexp_extract('abc', '(a)', CAST(-1 AS INT)); + +# Group index exceeding the number of groups should error +query error Regex group index 2 exceeds the number of groups 1 in pattern '\(a\)' +SELECT regexp_extract('abc', '(a)', CAST(2 AS INT)); + +# Null input produces null output +query T +SELECT regexp_extract(CAST(NULL AS VARCHAR), '(\d+)', CAST(1 AS INT)); +---- +NULL + +# Empty string input matches empty pattern group 0 +query T +SELECT regexp_extract('', '.*', CAST(0 AS INT)); +---- +(empty) + +# ----------------------------------------------------------------------- +# Tests for 2-arg form (idx defaults to 1, matching Spark's Catalyst +# `def this(s, r) = this(s, r, Literal(1))`) +# ----------------------------------------------------------------------- + +# 2-arg form: regexp_extract('100-200', r'(\d+)-(\d+)') → '100' +# idx defaults to 1, extracting first capture group +query T +SELECT regexp_extract('100-200', '(\d+)-(\d+)'); +---- +100 + +# 2-arg form with no match returns empty string +query T +SELECT regexp_extract('foo', '(\d+)'); +---- +(empty) + +# 2-arg form extracts first group from a single-group pattern +query T +SELECT regexp_extract('hello world', '(\w+)'); +---- +hello + +# ----------------------------------------------------------------------- +# Batch tests: multiple rows processed in a single query +# ----------------------------------------------------------------------- + +statement ok +CREATE TABLE t_extract (val varchar, pat varchar, idx int) AS VALUES + ('100-200', '(\d+)-(\d+)', 1), + ('foo', '(\d+)', 1), + ('aaaac', '(a+)(b)?(c)', 2), + ('hello-world', '(\w+)-(\w+)', 2); + +query T +SELECT regexp_extract(val, pat, idx) +FROM t_extract +ORDER BY val; +---- +100 +(empty) +(empty) +world + +statement ok +DROP TABLE t_extract; + +# ----------------------------------------------------------------------- +# Batch test: same pattern applied across an entire column (common case) +# 
----------------------------------------------------------------------- + +statement ok +CREATE TABLE t_dates (val varchar) AS VALUES + ('2024-01-15'), + ('2023-12-25'), + ('no-date-here'), + ('2025-06-30'), + (''); + +# Extract the year (group 1) using the same date pattern +query T +SELECT regexp_extract(val, '(\d{4})-(\d{2})-(\d{2})', CAST(1 AS INT)) +FROM t_dates +ORDER BY val; +---- +(empty) +2023 +2024 +2025 +(empty) + +statement ok +DROP TABLE t_dates; + +# ----------------------------------------------------------------------- +# Null handling with batch data +# ----------------------------------------------------------------------- + +statement ok +CREATE TABLE t_nulls (val varchar, pat varchar, idx int) AS VALUES + (NULL, '(\d+)', 1), + ('abc', NULL, 1), + ('abc', '(a)', NULL); + +query T +SELECT regexp_extract(val, pat, idx) +FROM t_nulls +ORDER BY val; +---- +NULL +NULL +NULL + +statement ok +DROP TABLE t_nulls; + +# ----------------------------------------------------------------------- +# Pattern with no capture groups (idx=0 returns entire match) +# ----------------------------------------------------------------------- + +query T +SELECT regexp_extract('xyz', 'abc', CAST(0 AS INT)); +---- +(empty) + +query T +SELECT regexp_extract('abc', 'abc', CAST(0 AS INT)); +---- +abc diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index e09c4cb7cbc32..f53e767261fb3 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2189,6 +2189,7 @@ regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) The following regular expression functions are supported: - [regexp_count](#regexp_count) +- [regexp_extract](#regexp_extract) - [regexp_instr](#regexp_instr) - [regexp_like](#regexp_like) - [regexp_match](#regexp_match) @@ -2225,6 +2226,32 @@ regexp_count(str, regexp[, start, flags]) 
+---------------------------------------------------------------+ ``` +### `regexp_extract` + +Extract a specific group matched by the [regular expression](https://docs.rs/regex/latest/regex/#syntax), from the specified string column. +If the regex did not match, or the specified group did not match, an empty string is returned. + +```sql +regexp_extract(str, pattern[, groupIndex]) +``` + +#### Arguments + +- **str**: String expression to operate on. Column or column name. +- **pattern**: Regular expression to operate on. Can be a constant. +- **groupIndex**: Match group ID. If not specified, it is assumed to be 1. + +#### Example + +```sql +> select regexp_extract('100-200', '(\d+)-(\d+)'); ++-----------------------------------------------------+ +| regexp_extract(Utf8("100-200"),Utf8("(\d+)-(\d+)")) | ++-----------------------------------------------------+ +| 100                                                 | ++-----------------------------------------------------+ +``` + ### `regexp_instr` Returns the position in a string where the specified occurrence of a POSIX regular expression is located.