Skip to content

Commit

Permalink
[SPARK-45076][PS] Switch to built-in repeat function
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Switch to built-in `repeat` function

### Why are the changes needed?

apache#42794 made `repeat` support a column-typed `n`, so we no longer need this PS-specific function.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
CI

### Was this patch authored or co-authored using generative AI tooling?
NO

Closes apache#42812 from zhengruifeng/ps_replace_repeat.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
  • Loading branch information
zhengruifeng committed Sep 5, 2023
1 parent 57c92fb commit a2c247b
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 19 deletions.
3 changes: 1 addition & 2 deletions python/pyspark/pandas/data_type_ops/num_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
_is_valid_for_logical_operator,
_is_boolean_type,
)
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
from pyspark.sql import functions as F
from pyspark.sql import Column as PySparkColumn
Expand Down Expand Up @@ -245,7 +244,7 @@ def pretty_name(self) -> str:
def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
return column_op(SF.repeat)(right, left)
return column_op(F.repeat)(right, left)

if not is_valid_operand_for_numeric_arithmetic(right):
raise TypeError("Multiplication can not be applied to given types.")
Expand Down
7 changes: 3 additions & 4 deletions python/pyspark/pandas/data_type_ops/string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
_as_string_type,
_sanitize_list_like,
)
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
from pyspark.sql.types import BooleanType

Expand Down Expand Up @@ -67,15 +66,15 @@ def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
return cast(
SeriesOrIndex,
left._with_new_scol(
SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
F.repeat(left.spark.column, right), field=left._internal.data_fields[0]
),
)
elif (
isinstance(right, IndexOpsMixin)
and isinstance(right.spark.data_type, IntegralType)
and not isinstance(right.dtype, CategoricalDtype)
):
return column_op(SF.repeat)(left, right)
return column_op(F.repeat)(left, right)
else:
raise TypeError("Multiplication can not be applied to given types.")

Expand All @@ -97,7 +96,7 @@ def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
return cast(
SeriesOrIndex,
left._with_new_scol(
SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
F.repeat(left.spark.column, right), field=left._internal.data_fields[0]
),
)
else:
Expand Down
11 changes: 0 additions & 11 deletions python/pyspark/pandas/spark/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@
"""
Additional Spark functions used in pandas-on-Spark.
"""
from typing import Union

from pyspark import SparkContext
import pyspark.sql.functions as F
from pyspark.sql.column import Column

# For supporting Spark Connect
Expand Down Expand Up @@ -135,14 +132,6 @@ def covar(col1: Column, col2: Column, ddof: int) -> Column:
return Column(sc._jvm.PythonSQLUtils.pandasCovar(col1._jc, col2._jc, ddof))


def repeat(col: Column, n: Union[int, Column]) -> Column:
    """
    Repeat the string in `col` `n` times, returning the result as a new string column.

    `n` may be either a Python int (wrapped into a literal column) or an
    existing Column expression.
    """
    # Normalize a plain int into a literal column so the UDF call sees a Column.
    if isinstance(n, int):
        n = F.lit(n)
    return F.call_udf("repeat", col, n)


def ewm(col: Column, alpha: float, ignore_na: bool) -> Column:
if is_remote():
from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
Expand Down
3 changes: 1 addition & 2 deletions python/pyspark/pandas/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
from pyspark.sql.functions import pandas_udf

import pyspark.pandas as ps
from pyspark.pandas.spark import functions as SF


class StringMethods:
Expand Down Expand Up @@ -1506,7 +1505,7 @@ def repeat(self, repeats: int) -> "ps.Series":
"""
if not isinstance(repeats, int):
raise TypeError("repeats expects an int parameter")
return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats))
return self._data.spark.transform(lambda c: F.repeat(col=c, n=repeats))

def replace(
self,
Expand Down

0 comments on commit a2c247b

Please sign in to comment.