[SPARK-44823][PYTHON] Update black to 23.9.1 and fix erroneous check
### What changes were proposed in this pull request?
This PR updates `black` from 22.6.0 to 23.9.1 and fixes an erroneous check.

### Why are the changes needed?
Black `22.6.0` was released on Jun 28, 2022, more than a year ago (https://pypi.org/project/black/23.7.0/#history).
Updating brings PySpark's code style in line with the requirements of the latest Black release.

Release notes:
- 23.9.1: https://github.com/psf/black/blob/main/CHANGES.md#2391
- 23.9.0: https://github.com/psf/black/blob/main/CHANGES.md#2390
- 23.7.0: https://github.com/psf/black/blob/main/CHANGES.md#2370
- 23.3.0: https://github.com/psf/black/blob/main/CHANGES.md#2330
- 23.1.0: https://github.com/psf/black/blob/main/CHANGES.md#2310
- 22.12.0: https://github.com/psf/black/blob/main/CHANGES.md#22120
- 22.10.0: https://github.com/psf/black/blob/main/CHANGES.md#22100
- 22.8.0: https://github.com/psf/black/blob/main/CHANGES.md#2280

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Passes GA (existing GitHub Actions checks).

Closes apache#42507 from panbingkun/SPARK-44823.

Authored-by: panbingkun <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
panbingkun authored and HyukjinKwon committed Sep 18, 2023
1 parent 99a979d commit 5299e54
Showing 95 changed files with 312 additions and 382 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
@@ -685,7 +685,7 @@ jobs:
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==23.9.1'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Python linter
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
2 changes: 1 addition & 1 deletion dev/pyproject.toml
@@ -27,7 +27,7 @@ testpaths = [
[tool.black]
# When changing the version, we have to update
# GitHub workflow version and dev/reformat-python
required-version = "22.6.0"
required-version = "23.9.1"
line-length = 100
target-version = ['py38']
include = '\.pyi?$'
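
The `[tool.black]` block above pins both the formatter version and the project's 100-character line length. As a rough illustration of what those settings mean, here is a sketch that applies them through Black's Python API (`black.Mode` and `black.format_str` are Black's public API, but this snippet is illustrative and not part of Spark's tooling):

```python
import black

# Mirror the dev/pyproject.toml settings: line-length = 100, target-version py38.
mode = black.Mode(line_length=100, target_versions={black.TargetVersion.PY38})

src = "for (k, v) in pairs: print(k, v)\n"
print(black.format_str(src, mode=mode))  # prints the reformatted source
```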
2 changes: 1 addition & 1 deletion dev/reformat-python
@@ -22,7 +22,7 @@ FWDIR="$( cd "$DIR"/.. && pwd )"
cd "$FWDIR"

BLACK_BUILD="${PYTHON_EXECUTABLE} -m black"
BLACK_VERSION="22.6.0"
BLACK_VERSION="23.9.1"
$PYTHON_EXECUTABLE -c 'import black' 2> /dev/null
if [ $? -ne 0 ]; then
echo "The Python library providing the 'black' module was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'."
2 changes: 1 addition & 1 deletion dev/requirements.txt
@@ -46,7 +46,7 @@ jira>=3.5.2
PyGithub

# pandas API on Spark Code formatter.
black==22.6.0
black==23.9.1
py

# Spark Connect (required)
3 changes: 1 addition & 2 deletions dev/run-tests.py
@@ -37,7 +37,7 @@

def setup_test_environ(environ):
print("[info] Setup the following environment variables for tests: ")
for (k, v) in environ.items():
for k, v in environ.items():
print("%s=%s" % (k, v))
os.environ[k] = v

@@ -331,7 +331,6 @@ def run_scala_tests_maven(test_profiles):


def run_scala_tests_sbt(test_modules, test_profiles):

sbt_test_goals = list(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))

if not sbt_test_goals:
4 changes: 2 additions & 2 deletions python/pyspark/conf.py
@@ -190,7 +190,7 @@ def setExecutorEnv(
elif key is not None:
self.set("spark.executorEnv.{}".format(key), cast(str, value))
elif pairs is not None:
for (k, v) in pairs:
for k, v in pairs:
self.set("spark.executorEnv.{}".format(k), v)
return self

@@ -203,7 +203,7 @@ def setAll(self, pairs: List[Tuple[str, str]]) -> "SparkConf":
pairs : iterable of tuples
list of key-value pairs to set
"""
for (k, v) in pairs:
for k, v in pairs:
self.set(k, v)
return self

2 changes: 1 addition & 1 deletion python/pyspark/context.py
@@ -285,7 +285,7 @@ def _do_init(
self.appName = self._conf.get("spark.app.name")
self.sparkHome = self._conf.get("spark.home", None)

for (k, v) in self._conf.getAll():
for k, v in self._conf.getAll():
if k.startswith("spark.executorEnv."):
varName = k[len("spark.executorEnv.") :]
self.environment[varName] = v
3 changes: 0 additions & 3 deletions python/pyspark/instrumentation_utils.py
@@ -31,7 +31,6 @@


def _wrap_function(class_name: str, function_name: str, func: Callable, logger: Any) -> Callable:

signature = inspect.signature(func)

@functools.wraps(func)
@@ -91,7 +90,6 @@ def wrapper(self: Any) -> Any:
def _wrap_missing_function(
class_name: str, function_name: str, func: Callable, original: Any, logger: Any
) -> Any:

if not hasattr(original, function_name):
return func

@@ -110,7 +108,6 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:


def _wrap_missing_property(class_name: str, property_name: str, prop: Any, logger: Any) -> Any:

is_deprecated = prop.fget.__name__ == "deprecated_property"

@property # type: ignore[misc]
8 changes: 4 additions & 4 deletions python/pyspark/join.py
@@ -44,7 +44,7 @@ def _do_python_join(rdd, other, numPartitions, dispatch):
def python_join(rdd, other, numPartitions):
def dispatch(seq):
vbuf, wbuf = [], []
for (n, v) in seq:
for n, v in seq:
if n == 1:
vbuf.append(v)
elif n == 2:
@@ -57,7 +57,7 @@ def dispatch(seq):
def python_right_outer_join(rdd, other, numPartitions):
def dispatch(seq):
vbuf, wbuf = [], []
for (n, v) in seq:
for n, v in seq:
if n == 1:
vbuf.append(v)
elif n == 2:
@@ -72,7 +72,7 @@ def dispatch(seq):
def python_left_outer_join(rdd, other, numPartitions):
def dispatch(seq):
vbuf, wbuf = [], []
for (n, v) in seq:
for n, v in seq:
if n == 1:
vbuf.append(v)
elif n == 2:
@@ -87,7 +87,7 @@ def dispatch(seq):
def python_full_outer_join(rdd, other, numPartitions):
def dispatch(seq):
vbuf, wbuf = [], []
for (n, v) in seq:
for n, v in seq:
if n == 1:
vbuf.append(v)
elif n == 2:
1 change: 0 additions & 1 deletion python/pyspark/ml/connect/evaluation.py
@@ -28,7 +28,6 @@


class _TorchMetricEvaluator(Evaluator):

metricName: Param[str] = Param(
Params._dummy(),
"metricName",
1 change: 0 additions & 1 deletion python/pyspark/ml/deepspeed/deepspeed_distributor.py
@@ -30,7 +30,6 @@


class DeepspeedTorchDistributor(TorchDistributor):

_DEEPSPEED_SSL_CONF = "deepspeed.spark.distributor.ignoreSsl"

def __init__(
2 changes: 0 additions & 2 deletions python/pyspark/ml/linalg/__init__.py
@@ -281,7 +281,6 @@ def simpleString(self) -> str:


class Vector:

__UDT__ = VectorUDT()

"""
@@ -1024,7 +1023,6 @@ def _equals(


class Matrix:

__UDT__ = MatrixUDT()

"""
1 change: 0 additions & 1 deletion python/pyspark/ml/param/__init__.py
@@ -373,7 +373,6 @@ def getOrDefault(self, param: Param[T]) -> T:
...

def getOrDefault(self, param: Union[str, Param[T]]) -> Union[Any, T]:

"""
Gets the value of a param in the user-supplied param map or its
default value. Raises an error if neither is set.
7 changes: 0 additions & 7 deletions python/pyspark/ml/tests/test_algorithms.py
@@ -38,7 +38,6 @@

class LogisticRegressionTest(SparkSessionTestCase):
def test_binomial_logistic_regression_with_bound(self):

df = self.spark.createDataFrame(
[
(1.0, 1.0, Vectors.dense(0.0, 5.0)),
@@ -60,7 +59,6 @@ def test_binomial_logistic_regression_with_bound(self):
self.assertTrue(np.isclose(model.intercept, 0.0, atol=1e-4))

def test_multinomial_logistic_regression_with_bound(self):

data_path = "data/mllib/sample_multiclass_classification_data.txt"
df = self.spark.read.format("libsvm").load(data_path)

@@ -84,7 +82,6 @@ def test_multinomial_logistic_regression_with_bound(self):
)

def test_logistic_regression_with_threshold(self):

df = self.spark.createDataFrame(
[
(1.0, 1.0, Vectors.dense(0.0, 5.0)),
@@ -121,7 +118,6 @@ def test_logistic_regression_with_threshold(self):

class MultilayerPerceptronClassifierTest(SparkSessionTestCase):
def test_raw_and_probability_prediction(self):

data_path = "data/mllib/sample_multiclass_classification_data.txt"
df = self.spark.read.format("libsvm").load(data_path)

@@ -375,7 +371,6 @@ def test_storage_levels(self):

class GeneralizedLinearRegressionTest(SparkSessionTestCase):
def test_tweedie_distribution(self):

df = self.spark.createDataFrame(
[
(1.0, Vectors.dense(0.0, 0.0)),
@@ -396,7 +391,6 @@
self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1e-4))

def test_offset(self):

df = self.spark.createDataFrame(
[
(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
@@ -417,7 +411,6 @@

class LinearRegressionTest(SparkSessionTestCase):
def test_linear_regression_with_huber_loss(self):

data_path = "data/mllib/sample_linear_regression_data.txt"
df = self.spark.read.format("libsvm").load(data_path)

1 change: 0 additions & 1 deletion python/pyspark/ml/tests/test_dl_util.py
@@ -26,7 +26,6 @@


class TestFunctionPickler(unittest.TestCase):

# Function that will be used to test pickling.
@staticmethod
def _test_function(x: float, y: float) -> float:
1 change: 0 additions & 1 deletion python/pyspark/ml/tests/test_feature.py
@@ -367,7 +367,6 @@ def test_vector_size_hint(self):

class HashingTFTest(SparkSessionTestCase):
def test_apply_binary_term_freqs(self):

df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
n = 10
hashingTF = HashingTF()
2 changes: 0 additions & 2 deletions python/pyspark/ml/tests/test_linalg.py
@@ -320,7 +320,6 @@ def test_norms(self):


class VectorUDTTests(MLlibTestCase):

dv0 = DenseVector([])
dv1 = DenseVector([1.0, 2.0])
sv0 = SparseVector(2, [], [])
@@ -367,7 +366,6 @@ def test_unwrap_udt(self):


class MatrixUDTTests(MLlibTestCase):

dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
1 change: 0 additions & 1 deletion python/pyspark/ml/torch/distributor.py
@@ -896,7 +896,6 @@ def _run_training_on_pytorch_function(
*args: Any,
**kwargs: Any,
) -> Any:

if not run_pytorch_file_fn:
run_pytorch_file_fn = TorchDistributor._run_training_on_pytorch_file

2 changes: 1 addition & 1 deletion python/pyspark/ml/tuning.py
@@ -184,7 +184,7 @@ def baseOn(self, *args: Union["ParamMap", Tuple[Param, Any]]) -> "ParamGridBuild
if isinstance(args[0], dict):
self.baseOn(*args[0].items())
else:
for (param, value) in args:
for param, value in args:
self.addGrid(param, [value])

return self
1 change: 0 additions & 1 deletion python/pyspark/ml/util.py
@@ -738,7 +738,6 @@ def try_remote_functions(f: FuncT) -> FuncT:

@functools.wraps(f)
def wrapped(*args: Any, **kwargs: Any) -> Any:

if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ:
from pyspark.ml.connect import functions

1 change: 0 additions & 1 deletion python/pyspark/ml/wrapper.py
@@ -422,7 +422,6 @@ def __init__(self, java_model: Optional["JavaObject"] = None):
"""
super(JavaModel, self).__init__(java_model)
if java_model is not None:

# SPARK-10931: This is a temporary fix to allow models to own params
# from estimators. Eventually, these params should be in models through
# using common base classes between estimators and models.
2 changes: 0 additions & 2 deletions python/pyspark/mllib/linalg/__init__.py
@@ -297,7 +297,6 @@ def simpleString(self) -> str:


class Vector:

__UDT__ = VectorUDT()

"""
@@ -1200,7 +1199,6 @@ def _equals(


class Matrix:

__UDT__ = MatrixUDT()

"""
1 change: 0 additions & 1 deletion python/pyspark/mllib/tests/test_feature.py
@@ -118,7 +118,6 @@ def test_binary_term_freqs(self):


class DimensionalityReductionTests(MLlibTestCase):

denseData = [
Vectors.dense([0.0, 1.0, 2.0]),
Vectors.dense([3.0, 4.0, 5.0]),
2 changes: 0 additions & 2 deletions python/pyspark/mllib/tests/test_linalg.py
@@ -409,7 +409,6 @@ def test_ml_mllib_matrix_conversion(self):


class VectorUDTTests(MLlibTestCase):

dv0 = DenseVector([])
dv1 = DenseVector([1.0, 2.0])
sv0 = SparseVector(2, [], [])
@@ -471,7 +470,6 @@ def test_row_matrix_invalid_type(self):


class MatrixUDTTests(MLlibTestCase):

dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
7 changes: 4 additions & 3 deletions python/pyspark/pandas/frame.py
@@ -7901,7 +7901,10 @@ def _swaplevel_columns(self, i: Union[int, Name], j: Union[int, Name]) -> Intern
)

column_label_names = self._internal.column_label_names.copy()
column_label_names[i], column_label_names[j], = (
(
column_label_names[i],
column_label_names[j],
) = (
column_label_names[j],
column_label_names[i],
)
@@ -11306,7 +11309,6 @@ def rename(
level: Optional[int] = None,
errors: str = "ignore",
) -> Optional["DataFrame"]:

"""
Alter axes labels.
Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series
@@ -12976,7 +12978,6 @@ def align(
if (
axis is None or axis == 1
) and left._internal.column_labels != right._internal.column_labels:

if left._internal.column_labels_level != right._internal.column_labels_level:
raise ValueError("cannot join with no overlapping index names")

1 change: 0 additions & 1 deletion python/pyspark/pandas/groupby.py
@@ -2010,7 +2010,6 @@ def pandas_apply(pdf: pd.DataFrame, *a: Any, **k: Any) -> Any:
return_schema = StructType([field.struct_field for field in data_fields])

def pandas_groupby_apply(pdf: pd.DataFrame) -> pd.DataFrame:

if is_series_groupby:
pdf_or_ser = pdf.groupby(groupkey_names)[name].apply(pandas_apply, *args, **kwargs)
else:
1 change: 0 additions & 1 deletion python/pyspark/pandas/indexing.py
@@ -1050,7 +1050,6 @@ def _select_rows_by_slice(
if (start is None and rows_sel.start is not None) or (
stop is None and rows_sel.stop is not None
):

inc = index_column.is_monotonic_increasing
if inc is False:
dec = index_column.is_monotonic_decreasing
1 change: 0 additions & 1 deletion python/pyspark/pandas/numpy_compat.py
@@ -215,7 +215,6 @@ def maybe_dispatch_ufunc_to_spark_func(
and (op_name in unary_np_spark_mappings or op_name in binary_np_spark_mappings)
and kwargs.get("out") is None
):

np_spark_map_func = unary_np_spark_mappings.get(op_name) or binary_np_spark_mappings.get(
op_name
)
(remaining changed files not shown)
