From 92a34132838f035d35c3fac06585eb06b96ad2fb Mon Sep 17 00:00:00 2001
From: nguyen1hc <nguyen206hc@gmail.com>
Date: Mon, 15 Jun 2026 21:38:58 +0700
Subject: [PATCH 1/6] [SPARK-42751][PS] Support findall with capture groups

---
 python/pyspark/pandas/strings.py               | 18 +++++++++++++++---
 .../pandas/tests/series/test_string_ops_adv.py |  9 +++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index b29482d32d784..b24c7e5c00033 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -19,6 +19,7 @@
 String functions on pandas-on-Spark Series
 """
 
+import re
 from functools import wraps
 from typing import (
     Any,
@@ -1174,14 +1175,25 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series":
         2    [b, b]
         dtype: object
         """
+        num_groups = re.compile(pat, flags=flags).groups
         str_dtype = is_str_dtype(self._data.dtype)
+        if num_groups > 1:
+            return_type = ArrayType(
+                ArrayType(StringType(), containsNull=True), containsNull=True
+            )
+        else:
+            return_type = ArrayType(StringType(), containsNull=True)
 
         # type hint does not support to specify array type yet.
-        @pandas_udf(  # type: ignore[call-overload]
-            returnType=ArrayType(StringType(), containsNull=True)
-        )
+        @pandas_udf(returnType=return_type)  # type: ignore[call-overload]
         def pudf(s: pd.Series) -> pd.Series:
             ret = s.str.findall(pat, flags)
+            if num_groups > 1:
+                ret = ret.map(
+                    lambda matches: [list(match) for match in matches]
+                    if isinstance(matches, list)
+                    else matches
+                )
             if str_dtype:
                 # ArrayType does not support NaN, so replace with None
                 ret = ret.replace(np.nan, None)
diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
index 9835ca1a6e4ea..ecb718ecf6474 100644
--- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
+++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -86,6 +86,15 @@ def test_string_findall(self):
                 lambda x: x.str.findall("wh.*", flags=re.IGNORECASE), self.pser, ignore_null=True
             )
 
+        pser = pd.Series(["abc-123 def-456", "no match"])
+        expected = pser.str.findall("([a-z]+)-([0-9]+)").map(
+            lambda matches: [list(match) for match in matches]
+        )
+        self.assert_eq(
+            ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)"),
+            expected,
+        )
+
     def test_string_index(self):
         pser = pd.Series(["tea", "eat"])
         self.check_func_on_series(lambda x: x.str.index("ea"), pser)

From d3fb798ff51f8798884d83e831bfe4e70f9eaa04 Mon Sep 17 00:00:00 2001
From: nguyen1hc <nguyen206hc@gmail.com>
Date: Tue, 16 Jun 2026 07:49:42 +0700
Subject: [PATCH 2/6] [SPARK-42751][PS][DOCS] Document findall capture group
 output

---
 python/pyspark/pandas/strings.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index b24c7e5c00033..e958ec20019c3 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -1117,6 +1117,12 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series":
             All non-overlapping matches of pattern or regular expression in
             each string of this Series.
 
+        Notes
+        -----
+        For regular expressions with more than one capture group, each match is
+        returned as a list of its captured groups rather than a tuple as in pandas,
+        because Spark SQL does not have a tuple type.
+
         Examples
         --------
         >>> s = ps.Series(['Lion', 'Monkey', 'Rabbit'])

From 4c84a241cdad38cb5ca30a3b6a81ddfa5a1a7259 Mon Sep 17 00:00:00 2001
From: nguyen1hc <nguyen206hc@gmail.com>
Date: Tue, 16 Jun 2026 12:43:33 +0700
Subject: [PATCH 3/6] [SPARK-42751][PS][TESTS] Fix findall capture group test
 assertion

---
 python/pyspark/pandas/strings.py                          | 4 +---
 python/pyspark/pandas/tests/series/test_string_ops_adv.py | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
index e958ec20019c3..5e2ce824ac97d 100644
--- a/python/pyspark/pandas/strings.py
+++ b/python/pyspark/pandas/strings.py
@@ -1184,9 +1184,7 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series":
         num_groups = re.compile(pat, flags=flags).groups
         str_dtype = is_str_dtype(self._data.dtype)
         if num_groups > 1:
-            return_type = ArrayType(
-                ArrayType(StringType(), containsNull=True), containsNull=True
-            )
+            return_type = ArrayType(ArrayType(StringType(), containsNull=True), containsNull=True)
         else:
             return_type = ArrayType(StringType(), containsNull=True)
 
diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
index ecb718ecf6474..669ca70d14a45 100644
--- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
+++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -91,8 +91,8 @@ def test_string_findall(self):
             lambda matches: [list(match) for match in matches]
         )
         self.assert_eq(
-            ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)"),
-            expected,
+            ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)").apply(str),
+            expected.apply(str),
         )
 
     def test_string_index(self):

From ef9a01bbd958d4306083785246d81dd7d3de2728 Mon Sep 17 00:00:00 2001
From: nguyen1hc <nguyen206hc@gmail.com>
Date: Tue, 16 Jun 2026 14:21:16 +0700
Subject: [PATCH 4/6] [SPARK-42751][PS][TESTS] Normalize findall capture group
 assertion

---
 .../pandas/tests/series/test_string_ops_adv.py     | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
index 669ca70d14a45..229b23151a83f 100644
--- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
+++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -87,13 +87,15 @@ def test_string_findall(self):
             )
 
         pser = pd.Series(["abc-123 def-456", "no match"])
-        expected = pser.str.findall("([a-z]+)-([0-9]+)").map(
-            lambda matches: [list(match) for match in matches]
-        )
-        self.assert_eq(
-            ps.from_pandas(pser).str.findall("([a-z]+)-([0-9]+)").apply(str),
-            expected.apply(str),
+        pattern = "([a-z]+)-([0-9]+)"
+        expected = pser.str.findall(pattern).map(lambda matches: [list(match) for match in matches])
+        actual = (
+            ps.from_pandas(pser)
+            .str.findall(pattern)
+            .to_pandas()
+            .map(lambda matches: [list(match) for match in matches])
         )
+        self.assert_eq(actual, expected)
 
     def test_string_index(self):
         pser = pd.Series(["tea", "eat"])

From 94626ddbba18cf015c837e9259b154fa4e99b044 Mon Sep 17 00:00:00 2001
From: nguyen1hc <nguyen206hc@gmail.com>
Date: Thu, 18 Jun 2026 09:31:12 +0700
Subject: [PATCH 5/6] [SPARK-42751][PS][TESTS] Cover null findall capture
 groups

---
 .../pandas/tests/series/test_string_ops_adv.py    | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
index 229b23151a83f..fda4febddc662 100644
--- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
+++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -86,15 +86,14 @@ def test_string_findall(self):
                 lambda x: x.str.findall("wh.*", flags=re.IGNORECASE), self.pser, ignore_null=True
             )
 
-        pser = pd.Series(["abc-123 def-456", "no match"])
+        pser = pd.Series(["abc-123 def-456", "no match", None])
         pattern = "([a-z]+)-([0-9]+)"
-        expected = pser.str.findall(pattern).map(lambda matches: [list(match) for match in matches])
-        actual = (
-            ps.from_pandas(pser)
-            .str.findall(pattern)
-            .to_pandas()
-            .map(lambda matches: [list(match) for match in matches])
-        )
+
+        def normalize_matches(matches):  # type: ignore[no-untyped-def]
+            return [list(match) for match in matches] if isinstance(matches, list) else matches
+
+        expected = pser.str.findall(pattern).map(normalize_matches)
+        actual = ps.from_pandas(pser).str.findall(pattern).to_pandas().map(normalize_matches)
         self.assert_eq(actual, expected)
 
     def test_string_index(self):

From 2234e4bcca6e76dd4dc34149d5e0b0c99347bdfb Mon Sep 17 00:00:00 2001
From: nguyen1hc <nguyen206hc@gmail.com>
Date: Thu, 18 Jun 2026 10:51:47 +0700
Subject: [PATCH 6/6] [SPARK-42751][PS][TESTS] Normalize null capture group
 results

---
 python/pyspark/pandas/tests/series/test_string_ops_adv.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
index fda4febddc662..0379f90a158ab 100644
--- a/python/pyspark/pandas/tests/series/test_string_ops_adv.py
+++ b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -90,10 +90,14 @@ def test_string_findall(self):
         pattern = "([a-z]+)-([0-9]+)"
 
         def normalize_matches(matches):  # type: ignore[no-untyped-def]
-            return [list(match) for match in matches] if isinstance(matches, list) else matches
+            if isinstance(matches, (list, np.ndarray)):
+                return [list(match) for match in matches]
+            return matches
 
         expected = pser.str.findall(pattern).map(normalize_matches)
-        actual = ps.from_pandas(pser).str.findall(pattern).to_pandas().map(normalize_matches)
+        actual = ps.from_pandas(pser).str.findall(pattern).to_pandas()
+        self.assertIsNone(actual.iloc[-1])
+        actual = actual.map(normalize_matches)
         self.assert_eq(actual, expected)
 
     def test_string_index(self):