apache · nguyen1hc · Jun 15, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py
@@ -19,6 +19,7 @@
 String functions on pandas-on-Spark Series
 """
 
+import re
 from functools import wraps
 from typing import (
     Any,
@@ -1116,6 +1117,12 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series":
             All non-overlapping matches of pattern or regular expression in
             each string of this Series.
 
+        Notes
+        -----
+        For regular expressions with more than one capture group, each match is
+        returned as a list of the captured groups (so each row is a list of lists)
+        rather than as a tuple as in pandas, because Spark SQL has no tuple type.
+
         Examples
         --------
         >>> s = ps.Series(['Lion', 'Monkey', 'Rabbit'])
@@ -1174,14 +1181,23 @@ def findall(self, pat: str, flags: int = 0) -> "ps.Series":
         2    [b, b]
         dtype: object
         """
+        num_groups = re.compile(pat, flags=flags).groups
         str_dtype = is_str_dtype(self._data.dtype)
+        if num_groups > 1:
+            return_type = ArrayType(ArrayType(StringType(), containsNull=True), containsNull=True)
+        else:
+            return_type = ArrayType(StringType(), containsNull=True)
 
         # type hint does not support to specify array type yet.
-        @pandas_udf(  # type: ignore[call-overload]
-            returnType=ArrayType(StringType(), containsNull=True)
-        )
+        @pandas_udf(returnType=return_type)  # type: ignore[call-overload]
         def pudf(s: pd.Series) -> pd.Series:
             ret = s.str.findall(pat, flags)
+            if num_groups > 1:
+                ret = ret.map(
+                    lambda matches: [list(match) for match in matches]
+                    if isinstance(matches, list)
+                    else matches
+                )
             if str_dtype:
                 # ArrayType does not support NaN, so replace with None
                 ret = ret.replace(np.nan, None)

diff --git a/python/pyspark/pandas/tests/series/test_string_ops_adv.py b/python/pyspark/pandas/tests/series/test_string_ops_adv.py
@@ -86,6 +86,20 @@ def test_string_findall(self):
                 lambda x: x.str.findall("wh.*", flags=re.IGNORECASE), self.pser, ignore_null=True
             )
 
+        pser = pd.Series(["abc-123 def-456", "no match", None])
+        pattern = "([a-z]+)-([0-9]+)"
+
+        def normalize_matches(matches):  # type: ignore[no-untyped-def]
+            if isinstance(matches, (list, np.ndarray)):
+                return [list(match) for match in matches]
+            return matches
+
+        expected = pser.str.findall(pattern).map(normalize_matches)
+        actual = ps.from_pandas(pser).str.findall(pattern).to_pandas()
+        self.assertIsNone(actual.iloc[-1])
+        actual = actual.map(normalize_matches)
+        self.assert_eq(actual, expected)
+
     def test_string_index(self):
         pser = pd.Series(["tea", "eat"])
         self.check_func_on_series(lambda x: x.str.index("ea"), pser)