From 92a34132838f035d35c3fac06585eb06b96ad2fb Mon Sep 17 00:00:00 2001 From: nguyen1hc Date: Mon, 15 Jun 2026 21:38:58 +0700 Subject: [PATCH 1/6] [SPARK-42751][PS] Support findall with capture groups --- python/pyspark/pandas/strings.py | 18 +++++++++++++++--- .../pandas/tests/series/test_string_ops_adv.py | 9 +++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py index b29482d32d784..b24c7e5c00033 100644 --- a/python/pyspark/pandas/strings.py +++ b/python/pyspark/pandas/strings.py @@ -19,6 +19,7 @@ String functions on pandas-on-Spark Series """ +import re from functools import wraps from typing import ( Any, @@ -1174,14 +1175,25 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series": 2 [b, b] dtype: object """ + num_groups = re.compile(pat, flags=flags).groups str_dtype = is_str_dtype(self._data.dtype) + if num_groups > 1: + return_type = ArrayType( + ArrayType(StringType(), containsNull=True), containsNull=True + ) + else: + return_type = ArrayType(StringType(), containsNull=True) # type hint does not support to specify array type yet. - @pandas_udf( # type: ignore[call-overload] - returnType=ArrayType(StringType(), containsNull=True) - ) + @pandas_udf(returnType=return_type) # type: ignore[call-overload] def pudf(s: pd.Series) -> pd.Series: ret = s.str.findall(pat, flags) + if num_groups > 1: + ret = ret.map( + lambda matches: [list(match) for match in matches] + if isinstance(matches, list) + else matches + ) if str_dtype: # ArrayType does not support NaN, so replace with None ret = ret.replace(np.nan, None) diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py index 9835ca1a6e4ea..ecb718ecf6474 100644 --- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py +++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py @@ -86,6 +86,15 @@ def test_string_findall(self): lambda x: x.str.findall("wh.*", flags=re.IGNORECASE), self.pser, ignore_null=True ) + pser = pd.Series(["abc-123 def-456", "no match"]) + expected = pser.str.findall("([a-z]+)-([0-9]+)").map( + lambda matches: [list(match) for match in matches] + ) + self.assert_eq( + ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)"), + expected, + ) + def test_string_index(self): pser = pd.Series(["tea", "eat"]) self.check_func_on_series(lambda x: x.str.index("ea"), pser) From d3fb798ff51f8798884d83e831bfe4e70f9eaa04 Mon Sep 17 00:00:00 2001 From: nguyen1hc Date: Tue, 16 Jun 2026 07:49:42 +0700 Subject: [PATCH 2/6] [SPARK-42751][PS][DOCS] Document findall capture group output --- python/pyspark/pandas/strings.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py index b24c7e5c00033..e958ec20019c3 100644 --- a/python/pyspark/pandas/strings.py +++ b/python/pyspark/pandas/strings.py @@ -1117,6 +1117,12 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series": All non-overlapping matches of pattern or regular expression in each string of this Series. + Notes + ----- + For regular expressions with more than one capture group, pandas-on-Spark + returns nested lists instead of pandas' tuple matches because Spark SQL + does not have a tuple type. + Examples -------- >>> s = ps.Series(['Lion', 'Monkey', 'Rabbit']) From 4c84a241cdad38cb5ca30a3b6a81ddfa5a1a7259 Mon Sep 17 00:00:00 2001 From: nguyen1hc Date: Tue, 16 Jun 2026 12:43:33 +0700 Subject: [PATCH 3/6] [SPARK-42751][PS][TESTS] Fix findall capture group test assertion --- python/pyspark/pandas/strings.py | 4 +--- python/pyspark/pandas/tests/series/test_string_ops_adv.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py index e958ec20019c3..5e2ce824ac97d 100644 --- a/python/pyspark/pandas/strings.py +++ b/python/pyspark/pandas/strings.py @@ -1184,9 +1184,7 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series": num_groups = re.compile(pat, flags=flags).groups str_dtype = is_str_dtype(self._data.dtype) if num_groups > 1: - return_type = ArrayType( - ArrayType(StringType(), containsNull=True), containsNull=True - ) + return_type = ArrayType(ArrayType(StringType(), containsNull=True), containsNull=True) else: return_type = ArrayType(StringType(), containsNull=True) diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py index ecb718ecf6474..669ca70d14a45 100644 --- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py +++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py @@ -91,8 +91,8 @@ def test_string_findall(self): lambda matches: [list(match) for match in matches] ) self.assert_eq( - ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)"), - expected, + ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)").apply(str), + expected.apply(str), ) def test_string_index(self): From ef9a01bbd958d4306083785246d81dd7d3de2728 Mon Sep 17 00:00:00 2001 From: nguyen1hc Date: Tue, 16 Jun 2026 14:21:16 +0700 Subject: [PATCH 4/6] [SPARK-42751][PS][TESTS] Normalize findall capture group assertion --- .../pandas/tests/series/test_string_ops_adv.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py index 669ca70d14a45..229b23151a83f 100644 --- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py +++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py @@ -87,13 +87,15 @@ def test_string_findall(self): ) pser = pd.Series(["abc-123 def-456", "no match"]) - expected = pser.str.findall("([a-z]+)-([0-9]+)").map( - lambda matches: [list(match) for match in matches] - ) - self.assert_eq( - ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)").apply(str), - expected.apply(str), + pattern = "([a-z]+)-([0-9]+)" + expected = pser.str.findall(pattern).map(lambda matches: [list(match) for match in matches]) + actual = ( + ps.from_pandas(pser) + .str.findall(pattern) + .to_pandas() + .map(lambda matches: [list(match) for match in matches]) ) + self.assert_eq(actual, expected) def test_string_index(self): pser = pd.Series(["tea", "eat"]) From 94626ddbba18cf015c837e9259b154fa4e99b044 Mon Sep 17 00:00:00 2001 From: nguyen1hc Date: Thu, 18 Jun 2026 09:31:12 +0700 Subject: [PATCH 5/6] [SPARK-42751][PS][TESTS] Cover null findall capture groups --- .../pandas/tests/series/test_string_ops_adv.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py index 229b23151a83f..fda4febddc662 100644 --- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py +++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py @@ -86,15 +86,14 @@ def test_string_findall(self): lambda x: x.str.findall("wh.*", flags=re.IGNORECASE), self.pser, ignore_null=True ) - pser = pd.Series(["abc-123 def-456", "no match"]) + pser = pd.Series(["abc-123 def-456", "no match", None]) pattern = "([a-z]+)-([0-9]+)" - expected = pser.str.findall(pattern).map(lambda matches: [list(match) for match in matches]) - actual = ( - ps.from_pandas(pser) - .str.findall(pattern) - .to_pandas() - .map(lambda matches: [list(match) for match in matches]) - ) + + def normalize_matches(matches): # type: ignore[no-untyped-def] + return [list(match) for match in matches] if isinstance(matches, list) else matches + + expected = pser.str.findall(pattern).map(normalize_matches) + actual = ps.from_pandas(pser).str.findall(pattern).to_pandas().map(normalize_matches) self.assert_eq(actual, expected) def test_string_index(self): From 2234e4bcca6e76dd4dc34149d5e0b0c99347bdfb Mon Sep 17 00:00:00 2001 From: nguyen1hc Date: Thu, 18 Jun 2026 10:51:47 +0700 Subject: [PATCH 6/6] [SPARK-42751][PS][TESTS] Normalize null capture group results --- python/pyspark/pandas/tests/series/test_string_ops_adv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py index fda4febddc662..0379f90a158ab 100644 --- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py +++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py @@ -90,10 +90,14 @@ def test_string_findall(self): pattern = "([a-z]+)-([0-9]+)" def normalize_matches(matches): # type: ignore[no-untyped-def] - return [list(match) for match in matches] if isinstance(matches, list) else matches + if isinstance(matches, (list, np.ndarray)): + return [list(match) for match in matches] + return matches expected = pser.str.findall(pattern).map(normalize_matches) - actual = ps.from_pandas(pser).str.findall(pattern).to_pandas().map(normalize_matches) + actual = ps.from_pandas(pser).str.findall(pattern).to_pandas() + self.assertIsNone(actual.iloc[-1]) + actual = actual.map(normalize_matches) self.assert_eq(actual, expected) def test_string_index(self):