Skip to content

Commit

Permalink
[SPARK-32719][PYTHON] Add Flake8 check for missing imports
Browse files Browse the repository at this point in the history
https://issues.apache.org/jira/browse/SPARK-32719

### What changes were proposed in this pull request?

Add a Flake8 check (F405) to detect missing imports. This ensures that every class we use is imported explicitly by name rather than pulled in through a wildcard import (`from module import *`).

### Why are the changes needed?

To make sure that the quality of the Python code is up to standard.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Existing unit tests and Flake8 static analysis.

Closes apache#29563 from Fokko/fd-add-check-missing-imports.

Authored-by: Fokko Driesprong <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
  • Loading branch information
Fokko authored and HyukjinKwon committed Aug 31, 2020
1 parent 6dacba7 commit a1e459e
Show file tree
Hide file tree
Showing 38 changed files with 111 additions and 66 deletions.
4 changes: 3 additions & 1 deletion dev/create-release/generate-contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
import re
import sys

from releaseutils import *
from releaseutils import tag_exists, raw_input, get_commits, yesOrNoPrompt, get_date, \
is_valid_author, capitalize_author, JIRA, find_components, translate_issue_type, \
translate_component, CORE_COMPONENT, contributors_file_name, nice_join

# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
Expand Down
12 changes: 10 additions & 2 deletions dev/create-release/translate-contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,15 @@
import os
import sys

from releaseutils import *
from releaseutils import JIRA, JIRAError, get_jira_name, Github, get_github_name, \
contributors_file_name, is_valid_author, raw_input, capitalize_author, yesOrNoPrompt

try:
import unidecode
except ImportError:
print("This tool requires the unidecode library to decode obscure github usernames")
print("Install using 'sudo pip install unidecode'")
sys.exit(-1)

# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
Expand Down Expand Up @@ -135,7 +143,7 @@ def generate_candidates(author, issues):
# Note that the candidate name may already be in unicode (JIRA returns this)
for i, (candidate, source) in enumerate(candidates):
try:
candidate = unicode(candidate, "UTF-8")
candidate = unicode(candidate, "UTF-8") # noqa: F821
except TypeError:
# already in unicode
pass
Expand Down
2 changes: 1 addition & 1 deletion dev/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ max-line-length=100
exclude=python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*

[flake8]
select = E901,E999,F821,F822,F823,F401
select = E901,E999,F821,F822,F823,F401,F405
exclude = python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*
max-line-length = 100
2 changes: 1 addition & 1 deletion examples/src/main/python/sql/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

# $example on:programmatic_schema$
# Import data types
from pyspark.sql.types import *
from pyspark.sql.types import StringType, StructType, StructField
# $example off:programmatic_schema$


Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@
from pyspark.context import SparkContext
from pyspark.rdd import RDD, RDDBarrier
from pyspark.files import SparkFiles
from pyspark.status import StatusTracker, SparkJobInfo, SparkStageInfo
from pyspark.util import InheritableThread
from pyspark.storagelevel import StorageLevel
from pyspark.accumulators import Accumulator, AccumulatorParam
from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo
from pyspark.profiler import Profiler, BasicProfiler
from pyspark.version import __version__ # noqa: F401
Expand Down
3 changes: 2 additions & 1 deletion python/pyspark/ml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@
import threading

from pyspark import since
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasLabelCol, HasFeaturesCol, \
HasPredictionCol, Params
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType

Expand Down
11 changes: 8 additions & 3 deletions python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,24 @@
#

import operator
import sys
import uuid
import warnings
from abc import ABCMeta, abstractmethod, abstractproperty
from multiprocessing.pool import ThreadPool

from pyspark import keyword_only
from pyspark import keyword_only, since, SparkContext
from pyspark.ml import Estimator, Predictor, PredictionModel, Model
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasRawPredictionCol, HasProbabilityCol, HasThresholds, \
HasRegParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, \
HasAggregationDepth, HasThreshold, HasBlockSize, Param, Params, TypeConverters, \
HasElasticNetParam, HasSeed, HasStepSize, HasSolver, HasParallelism
from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
_TreeEnsembleModel, _RandomForestParams, _GBTParams, \
_HasVarianceImpurity, _TreeClassifierParams
from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel
from pyspark.ml.util import *
from pyspark.ml.base import _PredictorParams
from pyspark.ml.util import JavaMLWritable, JavaMLReadable, HasTrainingSummary
from pyspark.ml.wrapper import JavaParams, \
JavaPredictor, JavaPredictionModel, JavaWrapper
from pyspark.ml.common import inherit_doc, _java2py, _py2java
Expand Down
7 changes: 5 additions & 2 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@
import warnings

from pyspark import since, keyword_only
from pyspark.ml.util import *
from pyspark.ml.param.shared import HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, \
HasAggregationDepth, HasWeightCol, HasTol, HasProbabilityCol, HasBlockSize, \
HasDistanceMeasure, HasCheckpointInterval, Param, Params, TypeConverters
from pyspark.ml.util import JavaMLWritable, JavaMLReadable, GeneralJavaMLWritable, \
HasTrainingSummary, SparkContext
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaWrapper
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc, _java2py
from pyspark.ml.stat import MultivariateGaussian
from pyspark.sql import DataFrame
Expand Down
4 changes: 3 additions & 1 deletion python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@

from pyspark import since, keyword_only, SparkContext
from pyspark.ml.linalg import _convert_to_vector
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasThreshold, HasThresholds, HasInputCol, HasOutputCol, \
HasInputCols, HasOutputCols, HasHandleInvalid, HasRelativeError, HasFeaturesCol, HasLabelCol, \
HasSeed, HasNumFeatures, HasStepSize, HasMaxIter, TypeConverters, Param, Params
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, _jvm
from pyspark.ml.common import inherit_doc
Expand Down
8 changes: 5 additions & 3 deletions python/pyspark/ml/fpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
# limitations under the License.
#

from pyspark import keyword_only
import sys

from pyspark import keyword_only, since
from pyspark.sql import DataFrame
from pyspark.ml.util import *
from pyspark.ml.util import JavaMLWritable, JavaMLReadable
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasPredictionCol, Param, TypeConverters, Params

__all__ = ["FPGrowth", "FPGrowthModel", "PrefixSpan"]

Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/ml/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os

from pyspark import keyword_only
from pyspark import keyword_only, since, SparkContext
from pyspark.ml.base import Estimator, Model, Transformer
from pyspark.ml.param import Param, Params
from pyspark.ml.util import *
from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader, \
DefaultParamsReader, DefaultParamsWriter, MLWriter, MLReader, JavaMLWritable
from pyspark.ml.wrapper import JavaParams, JavaWrapper
from pyspark.ml.common import inherit_doc, _java2py, _py2java

Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/ml/recommendation.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@
import sys

from pyspark import since, keyword_only
from pyspark.ml.util import *
from pyspark.ml.param.shared import HasPredictionCol, HasBlockSize, HasMaxIter, HasRegParam, \
HasCheckpointInterval, HasSeed
from pyspark.ml.wrapper import JavaEstimator, JavaModel
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc
from pyspark.ml.param import Params, TypeConverters, Param
from pyspark.ml.util import JavaMLWritable, JavaMLReadable


__all__ = ['ALS', 'ALSModel']
Expand Down
12 changes: 9 additions & 3 deletions python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,21 @@
# limitations under the License.
#

import sys

from abc import ABCMeta

from pyspark import keyword_only
from pyspark import keyword_only, since
from pyspark.ml import Predictor, PredictionModel
from pyspark.ml.base import _PredictorParams
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol, \
Param, Params, TypeConverters, HasMaxIter, HasTol, HasFitIntercept, HasAggregationDepth, \
HasBlockSize, HasRegParam, HasSolver, HasStepSize, HasSeed, HasElasticNetParam, \
HasStandardization, HasLoss, HasVarianceCol
from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
_TreeEnsembleModel, _RandomForestParams, _GBTParams, _TreeRegressorParams
from pyspark.ml.util import *
from pyspark.ml.util import JavaMLWritable, JavaMLReadable, HasTrainingSummary, \
GeneralJavaMLWritable
from pyspark.ml.wrapper import JavaEstimator, JavaModel, \
JavaPredictor, JavaPredictionModel, JavaWrapper
from pyspark.ml.common import inherit_doc
Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/ml/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
# limitations under the License.
#

from pyspark.ml.param.shared import *
from pyspark.ml.util import *
from pyspark import since
from pyspark.ml.param import Params
from pyspark.ml.param.shared import HasCheckpointInterval, HasSeed, HasWeightCol, Param, \
TypeConverters, HasMaxIter, HasStepSize, HasValidationIndicatorCol
from pyspark.ml.wrapper import JavaPredictionModel
from pyspark.ml.common import inherit_doc

Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/ml/tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
import itertools
from multiprocessing.pool import ThreadPool

import numpy as np

from pyspark import keyword_only
from pyspark import keyword_only, since, SparkContext
from pyspark.ml import Estimator, Model
from pyspark.ml.common import _py2java, _java2py
from pyspark.ml.param import Params, Param, TypeConverters
from pyspark.ml.param.shared import HasCollectSubModels, HasParallelism, HasSeed
from pyspark.ml.util import *
from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader
from pyspark.ml.wrapper import JavaParams
from pyspark.sql.functions import col, lit, rand, UserDefinedFunction
from pyspark.sql.types import BooleanType
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/mllib/stat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
Python package for statistical functions in MLlib.
"""

from pyspark.mllib.stat._statistics import *
from pyspark.mllib.stat._statistics import Statistics, MultivariateStatisticalSummary
from pyspark.mllib.stat.distribution import MultivariateGaussian
from pyspark.mllib.stat.test import ChiSqTestResult
from pyspark.mllib.stat.KernelDensity import KernelDensity
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from pyspark import copy_func, since
from pyspark.context import SparkContext
from pyspark.sql.types import *
from pyspark.sql.types import DataType, StructField, StructType, IntegerType, StringType

__all__ = ["Column"]

Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from pyspark.sql.column import Column, _to_seq, _to_list, _to_java_column
from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2
from pyspark.sql.streaming import DataStreamWriter
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.pandas.conversion import PandasConversionMixin
from pyspark.sql.pandas.map_ops import PandasMapOpsMixin

Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pyspark.sql.column import Column, _to_seq
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.pandas.group_ops import PandasGroupedOpsMixin
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

__all__ = ["GroupedData"]

Expand Down
3 changes: 2 additions & 1 deletion python/pyspark/sql/pandas/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
from pyspark.rdd import _load_from_socket
from pyspark.sql.pandas.serializers import ArrowCollectSerializer
from pyspark.sql.types import IntegralType
from pyspark.sql.types import *
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \
DoubleType, BooleanType, TimestampType, StructType, DataType
from pyspark.traceback_utils import SCCallSiteSync


Expand Down
4 changes: 3 additions & 1 deletion python/pyspark/sql/pandas/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
pandas instances during the type conversion.
"""

from pyspark.sql.types import *
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \
DoubleType, DecimalType, StringType, BinaryType, DateType, TimestampType, ArrayType, \
StructType, StructField, BooleanType


def to_arrow_type(dt):
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/sql/readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys

from py4j.java_gateway import JavaClass

from pyspark import RDD, since
from pyspark.sql.column import _to_seq, _to_java_column
from pyspark.sql.types import *
from pyspark.sql.types import StructType
from pyspark.sql import utils
from pyspark.sql.utils import to_str

Expand Down Expand Up @@ -1225,7 +1226,6 @@ def overwrite(self, condition):
Overwrite rows matching the given filter condition with the contents of the data frame in
the output table.
"""
condition = _to_java_column(column)
self._jwriter.overwrite(condition)

@since(3.1)
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/sql/streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from pyspark import since, keyword_only
from pyspark.sql.column import _to_seq
from pyspark.sql.readwriter import OptionUtils, to_str
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.utils import ForeachBatchFunction, StreamingQueryException

__all__ = ["StreamingQuery", "StreamingQueryManager", "DataStreamReader", "DataStreamWriter"]
Expand Down Expand Up @@ -1239,8 +1239,8 @@ def _test():
globs = pyspark.sql.streaming.__dict__.copy()
try:
spark = SparkSession.builder.getOrCreate()
except py4j.protocol.Py4JError:
spark = SparkSession(sc)
except py4j.protocol.Py4JError: # noqa: F821
spark = SparkSession(sc) # noqa: F821

globs['tempfile'] = tempfile
globs['os'] = os
Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/sql/tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StringType, IntegerType, LongType, \
FloatType, DoubleType, DecimalType, DateType, TimestampType, BinaryType, StructField, MapType, \
ArrayType
from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
pandas_requirement_message, pyarrow_requirement_message
from pyspark.testing.utils import QuietTest
Expand Down Expand Up @@ -495,7 +497,7 @@ def conf(cls):


if __name__ == "__main__":
from pyspark.sql.tests.test_arrow import *
from pyspark.sql.tests.test_arrow import * # noqa: F401

try:
import xmlrunner
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#

from pyspark.sql import Column, Row
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql.utils import AnalysisException
from pyspark.testing.sqlutils import ReusedSQLTestCase

Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from pyspark import SparkContext, SQLContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StringType, StructField
from pyspark.sql.window import Window
from pyspark.testing.utils import ReusedPySparkTestCase

Expand Down
5 changes: 3 additions & 2 deletions python/pyspark/sql/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
import unittest

from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.types import StringType, IntegerType, DoubleType, StructType, StructField, \
BooleanType, DateType, TimestampType, FloatType
from pyspark.sql.utils import AnalysisException, IllegalArgumentException
from pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils, have_pyarrow, have_pandas, \
pandas_requirement_message, pyarrow_requirement_message
Expand Down Expand Up @@ -903,7 +904,7 @@ def test_query_execution_listener_on_collect_with_arrow(self):


if __name__ == "__main__":
from pyspark.sql.tests.test_dataframe import *
from pyspark.sql.tests.test_dataframe import * # noqa: F401

try:
import xmlrunner
Expand Down
Loading

0 comments on commit a1e459e

Please sign in to comment.