From 37ec2df0903de6f15e21deb04284808e390e752f Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Thu, 20 Apr 2023 20:19:10 +0200
Subject: [PATCH] Spark 2.4: Remove module (#7385)

---
 .github/workflows/spark-ci.yml | 26 -
 .gitignore | 1 -
 dev/stage-binaries.sh | 2 +-
 docs/spark-configuration.md | 10 -
 docs/spark-ddl.md | 10 +-
 docs/spark-queries.md | 46 +-
 docs/spark-structured-streaming.md | 4 -
 docs/spark-writes.md | 48 +-
 gradle.properties | 2 +-
 jmh.gradle | 4 -
 settings.gradle | 10 -
 spark/build.gradle | 4 -
 spark/v2.4/build.gradle | 184 --
 spark/v2.4/spark-runtime/LICENSE | 606 ------
 spark/v2.4/spark-runtime/NOTICE | 508 -----
 .../iceberg/spark/SparkBenchmarkUtil.java | 57 -
 .../SparkParquetReadersFlatDataBenchmark.java | 222 --
 ...parkParquetReadersNestedDataBenchmark.java | 220 --
 .../SparkParquetWritersFlatDataBenchmark.java | 128 --
 ...parkParquetWritersNestedDataBenchmark.java | 128 --
 .../apache/iceberg/spark/source/Action.java | 24 -
 .../spark/source/IcebergSourceBenchmark.java | 199 --
 .../IcebergSourceFlatDataBenchmark.java | 59 -
 .../IcebergSourceNestedDataBenchmark.java | 59 -
 .../IcebergSourceNestedListDataBenchmark.java | 62 -
 .../spark/source/WritersBenchmark.java | 366 ----
 .../source/avro/AvroWritersBenchmark.java | 39 -
 ...cebergSourceFlatAvroDataReadBenchmark.java | 142 --
 ...bergSourceNestedAvroDataReadBenchmark.java | 142 --
 .../IcebergSourceFlatORCDataBenchmark.java | 68 -
 ...IcebergSourceFlatORCDataReadBenchmark.java | 210 --
 ...SourceNestedListORCDataWriteBenchmark.java | 109 -
 ...ebergSourceNestedORCDataReadBenchmark.java | 183 --
 ...gSourceFlatParquetDataFilterBenchmark.java | 129 --
 ...ergSourceFlatParquetDataReadBenchmark.java | 165 --
 ...rgSourceFlatParquetDataWriteBenchmark.java | 89 -
 ...ceNestedListParquetDataWriteBenchmark.java | 89 -
 ...ourceNestedParquetDataFilterBenchmark.java | 128 --
 ...gSourceNestedParquetDataReadBenchmark.java | 166 --
 ...SourceNestedParquetDataWriteBenchmark.java | 88 -
 .../parquet/ParquetWritersBenchmark.java | 39 -
 ...ionaryEncodedFlatParquetDataBenchmark.java | 141 --
 ...ectorizedReadFlatParquetDataBenchmark.java | 333 ---
 .../org/apache/iceberg/actions/Actions.java | 105 -
 .../actions/RewriteDataFilesAction.java | 70 -
 .../apache/iceberg/actions/SparkActions.java | 34 -
 .../apache/iceberg/spark/IcebergSpark.java | 43 -
 .../apache/iceberg/spark/JobGroupInfo.java | 44 -
 .../apache/iceberg/spark/JobGroupUtils.java | 46 -
 .../spark/PruneColumnsWithReordering.java | 275 ---
 .../spark/PruneColumnsWithoutReordering.java | 240 ---
 .../apache/iceberg/spark/SparkConfParser.java | 186 --
 .../apache/iceberg/spark/SparkDataFile.java | 208 --
 .../iceberg/spark/SparkExceptionUtil.java | 64 -
 .../apache/iceberg/spark/SparkFilters.java | 192 --
 .../spark/SparkFixupTimestampType.java | 57 -
 .../apache/iceberg/spark/SparkFixupTypes.java | 63 -
 .../apache/iceberg/spark/SparkReadConf.java | 165 --
 .../iceberg/spark/SparkReadOptions.java | 65 -
 .../iceberg/spark/SparkSQLProperties.java | 45 -
 .../apache/iceberg/spark/SparkSchemaUtil.java | 308 ---
 .../apache/iceberg/spark/SparkStructLike.java | 54 -
 .../apache/iceberg/spark/SparkTableUtil.java | 792 --------
 .../apache/iceberg/spark/SparkTypeToType.java | 158 --
 .../iceberg/spark/SparkTypeVisitor.java | 78 -
 .../org/apache/iceberg/spark/SparkUtil.java | 202 --
 .../iceberg/spark/SparkValueConverter.java | 133 --
 .../apache/iceberg/spark/SparkWriteConf.java | 152 --
 .../iceberg/spark/SparkWriteOptions.java | 53 -
 .../apache/iceberg/spark/TypeToSparkType.java | 122 --
.../BaseDeleteOrphanFilesSparkAction.java | 299 --- .../BaseDeleteReachableFilesSparkAction.java | 212 -- .../BaseExpireSnapshotsSparkAction.java | 295 --- .../BaseRewriteManifestsSparkAction.java | 408 ---- .../BaseSnapshotUpdateSparkAction.java | 45 - .../spark/actions/BaseSparkAction.java | 176 -- .../spark/actions/BaseSparkActions.java | 60 - .../spark/actions/ManifestFileBean.java | 143 -- .../iceberg/spark/actions/SparkActions.java | 43 - .../data/AvroWithSparkSchemaVisitor.java | 76 - .../data/ParquetWithSparkSchemaVisitor.java | 231 --- .../iceberg/spark/data/SparkAvroReader.java | 168 -- .../iceberg/spark/data/SparkAvroWriter.java | 165 -- .../iceberg/spark/data/SparkOrcReader.java | 132 -- .../spark/data/SparkOrcValueReaders.java | 241 --- .../spark/data/SparkOrcValueWriters.java | 201 -- .../iceberg/spark/data/SparkOrcWriter.java | 226 --- .../spark/data/SparkParquetReaders.java | 769 ------- .../spark/data/SparkParquetWriters.java | 457 ----- .../iceberg/spark/data/SparkValueReaders.java | 288 --- .../iceberg/spark/data/SparkValueWriters.java | 258 --- .../ArrowVectorAccessorFactory.java | 125 -- .../data/vectorized/ArrowVectorAccessors.java | 38 - .../data/vectorized/ColumnarBatchReader.java | 64 - .../data/vectorized/ConstantColumnVector.java | 122 -- .../vectorized/IcebergArrowColumnVector.java | 159 -- .../vectorized/RowPositionColumnVector.java | 120 -- .../vectorized/VectorizedSparkOrcReaders.java | 459 ----- .../VectorizedSparkParquetReaders.java | 53 - .../iceberg/spark/source/BaseDataReader.java | 205 -- .../iceberg/spark/source/BatchDataReader.java | 130 -- .../iceberg/spark/source/CustomCatalogs.java | 111 - .../spark/source/EqualityDeleteRowReader.java | 54 - .../iceberg/spark/source/IcebergSource.java | 189 -- .../spark/source/InternalRowWrapper.java | 91 - .../apache/iceberg/spark/source/Reader.java | 591 ------ .../iceberg/spark/source/RowDataReader.java | 197 -- .../iceberg/spark/source/RowDataRewriter.java | 179 -- .../spark/source/SparkAppenderFactory.java | 318 --- .../spark/source/SparkFileWriterFactory.java | 276 --- .../source/SparkPartitionedFanoutWriter.java | 55 - .../spark/source/SparkPartitionedWriter.java | 55 - .../apache/iceberg/spark/source/Stats.java | 42 - .../iceberg/spark/source/StreamingOffset.java | 138 -- .../iceberg/spark/source/StreamingWriter.java | 120 -- .../spark/source/StructInternalRow.java | 359 ---- .../apache/iceberg/spark/source/Writer.java | 375 ---- ...pache.spark.sql.sources.DataSourceRegister | 20 - .../java/org/apache/iceberg/KryoHelpers.java | 51 - .../org/apache/iceberg/TaskCheckHelper.java | 109 - .../iceberg/TestDataFileSerialization.java | 176 -- .../iceberg/TestFileIOSerialization.java | 108 - .../TestManifestFileSerialization.java | 217 -- .../iceberg/TestScanTaskSerialization.java | 143 -- .../iceberg/TestTableSerialization.java | 97 - .../org/apache/iceberg/ValidationHelpers.java | 77 - .../actions/TestRewriteDataFilesAction.java | 469 ----- .../iceberg/examples/ConcurrencyTest.java | 123 -- .../org/apache/iceberg/examples/README.md | 195 -- .../examples/ReadAndWriteTablesTest.java | 150 -- .../iceberg/examples/SchemaEvolutionTest.java | 214 -- .../apache/iceberg/examples/SimpleRecord.java | 78 - .../examples/SnapshotFunctionalityTest.java | 147 -- .../apache/iceberg/spark/SparkTestBase.java | 219 -- .../iceberg/spark/TestSparkSchemaUtil.java | 54 - .../spark/TestSparkValueConverter.java | 94 - .../TestDeleteReachableFilesAction.java | 331 --- .../actions/TestExpireSnapshotsAction.java | 1121 ---------- 
.../actions/TestRemoveOrphanFilesAction.java | 737 ------- .../actions/TestRewriteManifestsAction.java | 604 ------ .../iceberg/spark/data/AvroDataTest.java | 285 --- .../iceberg/spark/data/GenericsHelpers.java | 346 ---- .../apache/iceberg/spark/data/RandomData.java | 368 ---- .../iceberg/spark/data/TestHelpers.java | 770 ------- .../iceberg/spark/data/TestOrcWrite.java | 59 - .../spark/data/TestParquetAvroReader.java | 236 --- .../spark/data/TestParquetAvroWriter.java | 123 -- .../spark/data/TestSparkAvroEnums.java | 96 - .../spark/data/TestSparkAvroReader.java | 64 - .../spark/data/TestSparkDateTimes.java | 70 - .../data/TestSparkOrcReadMetadataColumns.java | 220 -- .../spark/data/TestSparkOrcReader.java | 110 - .../TestSparkParquetReadMetadataColumns.java | 237 --- .../spark/data/TestSparkParquetReader.java | 206 -- .../spark/data/TestSparkParquetWriter.java | 119 -- .../data/TestSparkRecordOrcReaderWriter.java | 153 -- ...rquetDictionaryEncodedVectorizedReads.java | 97 - ...allbackToPlainEncodingVectorizedReads.java | 75 - .../TestParquetVectorizedReads.java | 352 ---- .../iceberg/spark/source/ComplexRecord.java | 74 - .../iceberg/spark/source/LogMessage.java | 119 -- .../iceberg/spark/source/ManualSource.java | 109 - .../iceberg/spark/source/NestedRecord.java | 77 - .../iceberg/spark/source/SimpleRecord.java | 78 - .../iceberg/spark/source/TestAvroScan.java | 111 - .../iceberg/spark/source/TestCatalog.java | 128 -- .../spark/source/TestCustomCatalog.java | 207 -- .../spark/source/TestDataFrameWrites.java | 422 ---- .../spark/source/TestDataSourceOptions.java | 409 ---- .../spark/source/TestFilteredScan.java | 658 ------ .../source/TestForwardCompatibility.java | 222 -- .../spark/source/TestIcebergSource.java | 35 - .../source/TestIcebergSourceHadoopTables.java | 67 - .../source/TestIcebergSourceHiveTables.java | 78 - .../source/TestIcebergSourceTablesBase.java | 1801 ----------------- .../spark/source/TestIcebergSpark.java | 205 -- .../source/TestIdentityPartitionData.java | 209 -- .../spark/source/TestInternalRowWrapper.java | 79 - .../source/TestNameMappingProjection.java | 210 -- .../iceberg/spark/source/TestParquetScan.java | 140 -- .../spark/source/TestPartitionPruning.java | 467 ----- .../spark/source/TestPartitionValues.java | 493 ----- .../spark/source/TestReadProjection.java | 609 ------ .../iceberg/spark/source/TestSelect.java | 252 --- .../spark/source/TestSnapshotSelection.java | 227 --- .../source/TestSparkAppenderFactory.java | 69 - .../spark/source/TestSparkBaseDataReader.java | 276 --- .../spark/source/TestSparkDataFile.java | 224 -- .../spark/source/TestSparkDataWrite.java | 655 ------ .../source/TestSparkFileWriterFactory.java | 74 - .../spark/source/TestSparkMergingMetrics.java | 73 - .../source/TestSparkPartitioningWriters.java | 74 - .../source/TestSparkPositionDeltaWriters.java | 74 - .../spark/source/TestSparkReadProjection.java | 258 --- .../spark/source/TestSparkReaderDeletes.java | 245 --- .../source/TestSparkRollingFileWriters.java | 60 - .../iceberg/spark/source/TestSparkSchema.java | 182 -- .../spark/source/TestSparkTableUtil.java | 571 ------ ...TestSparkTableUtilWithInMemoryCatalog.java | 580 ------ .../spark/source/TestSparkWriterMetrics.java | 67 - .../spark/source/TestStreamingOffset.java | 62 - .../spark/source/TestStructuredStreaming.java | 301 --- .../iceberg/spark/source/TestTables.java | 205 -- .../source/TestTimestampWithoutZone.java | 309 --- .../spark/source/TestWriteMetricsConfig.java | 298 --- .../spark/source/ThreeColumnRecord.java | 83 - 
.../spark/src/test/resources/data/books.json | 6 - .../src/test/resources/data/new-books.json | 4 - 208 files changed, 16 insertions(+), 40461 deletions(-) delete mode 100644 spark/v2.4/build.gradle delete mode 100644 spark/v2.4/spark-runtime/LICENSE delete mode 100644 spark/v2.4/spark-runtime/NOTICE delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java delete mode 100644 
spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java delete mode 100644 
spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java 
delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java delete mode 100644 spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java delete mode 100644 spark/v2.4/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/ValidationHelpers.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/README.md delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java delete mode 100644 
spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ComplexRecord.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/NestedRecord.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java delete mode 100644 
spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java delete mode 100644 spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java delete mode 100644 
spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java delete mode 100644 spark/v2.4/spark/src/test/resources/data/books.json delete mode 100644 spark/v2.4/spark/src/test/resources/data/new-books.json diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index 794d845ad635..f01456930dde 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -55,32 +55,6 @@ concurrency: cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: - spark2-tests: - runs-on: ubuntu-22.04 - env: - SPARK_LOCAL_IP: localhost - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: zulu - java-version: 8 - - uses: actions/cache@v3 - with: - path: | - ~/.gradle/caches - ~/.gradle/wrapper - key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }} - restore-keys: ${{ runner.os }}-gradle- - - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - - run: ./gradlew -DsparkVersions=2.4 -DhiveVersions= -DflinkVersions= :iceberg-spark:check :iceberg-spark:iceberg-spark-2.4:check :iceberg-spark:iceberg-spark-runtime-2.4:check -Pquick=true -x javadoc - - uses: actions/upload-artifact@v3 - if: failure() - with: - name: test logs - path: | - **/build/testlogs - spark-3x-scala-2-12-tests: runs-on: ubuntu-22.04 strategy: diff --git a/.gitignore b/.gitignore index 54e4275accdb..13e95b24648b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,6 @@ lib/ site/site # benchmark output -spark/v2.4/spark/benchmark/* spark/v3.1/spark/benchmark/* spark/v3.2/spark/benchmark/* spark/v3.3/spark/benchmark/* diff --git a/dev/stage-binaries.sh b/dev/stage-binaries.sh index 4fc0c514c695..0ec940f64fae 100755 --- a/dev/stage-binaries.sh +++ b/dev/stage-binaries.sh @@ -20,7 +20,7 @@ SCALA_VERSION=2.12 FLINK_VERSIONS=1.15,1.16,1.17 -SPARK_VERSIONS=2.4,3.1,3.2,3.3,3.4 +SPARK_VERSIONS=3.1,3.2,3.3,3.4 HIVE_VERSIONS=2,3 ./gradlew -Prelease -DscalaVersion=$SCALA_VERSION -DflinkVersions=$FLINK_VERSIONS -DsparkVersions=$SPARK_VERSIONS -DhiveVersions=$HIVE_VERSIONS publishApachePublicationToMavenRepository diff --git a/docs/spark-configuration.md b/docs/spark-configuration.md index 70c415db3e61..926ec0207dad 100644 --- a/docs/spark-configuration.md +++ b/docs/spark-configuration.md @@ -124,13 +124,6 @@ spark.sql.catalog.custom_prod.catalog-impl = com.my.custom.CatalogImpl spark.sql.catalog.custom_prod.my-additional-catalog-config = my-value ``` -### Catalogs in Spark 2.4 - -When using Iceberg 0.11.0 and later, Spark 2.4 can load tables from multiple Iceberg catalogs or from table locations. - -Catalogs in 2.4 are configured just like catalogs in 3.x, but only Iceberg catalogs are supported. - - ## SQL Extensions Iceberg 0.11.0 and later add an extension module to Spark to add new SQL commands, like `CALL` for stored procedures or `ALTER TABLE ... WRITE ORDERED BY`. @@ -142,9 +135,6 @@ Using those SQL commands requires adding Iceberg extensions to your Spark enviro |---------------------------|---------------------------------------------------------------------| | `spark.sql.extensions` | `org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions` | -SQL extensions are not available for Spark 2.4. 
- - ## Runtime configuration ### Read options diff --git a/docs/spark-ddl.md b/docs/spark-ddl.md index 2f979b5443d6..aaff5a05e640 100644 --- a/docs/spark-ddl.md +++ b/docs/spark-ddl.md @@ -27,13 +27,7 @@ menu: # Spark DDL -To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). - -Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions. Spark 2.4 does not support SQL DDL. - -{{< hint info >}} -Spark 2.4 can't create Iceberg tables with DDL, instead use Spark 3 or the [Iceberg API](..//java-api-quickstart). -{{< /hint >}} +To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. ## `CREATE TABLE` @@ -256,7 +250,7 @@ ADD COLUMN points.value.b int Note: Altering a map 'key' column by adding columns is not allowed. Only map values can be updated. -In Spark 2.4.4 and later, you can add columns in any position by adding `FIRST` or `AFTER` clauses: +Add columns in any position by adding `FIRST` or `AFTER` clauses: ```sql ALTER TABLE prod.db.sample diff --git a/docs/spark-queries.md b/docs/spark-queries.md index 7d5ec60681f1..f2ebf1d893f6 100644 --- a/docs/spark-queries.md +++ b/docs/spark-queries.md @@ -27,22 +27,7 @@ menu: # Spark Queries -To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). - -Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API with different levels of support in Spark versions: - -| Feature support | Spark 3 | Spark 2.4 | Notes | -|--------------------------------------------------|-----------|------------|------------------------------------------------| -| [`SELECT`](#querying-with-sql) | ✔️ | | | -| [DataFrame reads](#querying-with-dataframes) | ✔️ | ✔️ | | -| [Metadata table `SELECT`](#inspecting-tables) | ✔️ | | | -| [History metadata table](#history) | ✔️ | ✔️ | | -| [Snapshots metadata table](#snapshots) | ✔️ | ✔️ | | -| [Files metadata table](#files) | ✔️ | ✔️ | | -| [Manifests metadata table](#manifests) | ✔️ | ✔️ | | -| [Partitions metadata table](#partitions) | ✔️ | ✔️ | | -| [All metadata tables](#all-metadata-tables) | ✔️ | ✔️ | | - +To use Iceberg in Spark, first configure [Spark catalogs](../spark-configuration). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. ## Querying with SQL @@ -75,8 +60,6 @@ val df = spark.table("prod.db.table") ### Catalogs with DataFrameReader -Iceberg 0.11.0 adds multi-catalog support to `DataFrameReader` in both Spark 3 and 2.4. - Paths and table names can be loaded with Spark's `DataFrameReader` interface. How tables are loaded depends on how the identifier is specified. When using `spark.read.format("iceberg").load(table)` or `spark.table(table)` the `table` variable can take a number of forms as listed below: @@ -205,29 +188,6 @@ Incremental read works with both V1 and V2 format-version. Incremental read is not supported by Spark's SQL syntax. 
{{< /hint >}} -### Spark 2.4 - -Spark 2.4 requires using the DataFrame reader with `iceberg` as a format, because 2.4 does not support direct SQL queries: - -```scala -// named metastore table -spark.read.format("iceberg").load("catalog.db.table") -// Hadoop path table -spark.read.format("iceberg").load("hdfs://nn:8020/path/to/table") -``` - -#### Spark 2.4 with SQL - -To run SQL `SELECT` statements on Iceberg tables in 2.4, register the DataFrame as a temporary table: - -```scala -val df = spark.read.format("iceberg").load("db.table") -df.createOrReplaceTempView("table") - -spark.sql("""select count(1) from table""").show() -``` - - ## Inspecting tables To inspect a table's history, snapshots, and other metadata, Iceberg supports metadata tables. @@ -235,8 +195,6 @@ To inspect a table's history, snapshots, and other metadata, Iceberg supports me Metadata tables are identified by adding the metadata table name after the original table name. For example, history for `db.table` is read using `db.table.history`. {{< hint info >}} -For Spark 2.4, use the `DataFrameReader` API to [inspect tables](#inspecting-with-dataframes). - For Spark 3, prior to 3.2, the Spark [session catalog](../spark-configuration#replacing-the-session-catalog) does not support table names with multipart identifiers such as `catalog.database.table.metadata`. As a workaround, configure an `org.apache.iceberg.spark.SparkCatalog`, or use the Spark `DataFrameReader` API. {{< /hint >}} @@ -422,7 +380,7 @@ SELECT * FROM prod.db.table.refs; ### Inspecting with DataFrames -Metadata tables can be loaded in Spark 2.4 or Spark 3 using the DataFrameReader API: +Metadata tables can be loaded using the DataFrameReader API: ```scala // named metastore table diff --git a/docs/spark-structured-streaming.md b/docs/spark-structured-streaming.md index bdb4b34057a3..77a79608c341 100644 --- a/docs/spark-structured-streaming.md +++ b/docs/spark-structured-streaming.md @@ -32,10 +32,6 @@ with different levels of support in Spark versions. As of Spark 3, DataFrame reads and writes are supported. -| Feature support | Spark 3 | Spark 2.4 | Notes | -|--------------------------------------------------|-----------|------------|------------------------------------------------| -| [DataFrame write](#streaming-writes) | ✔ | ✔ | | - ## Streaming Reads Iceberg supports processing incremental data in spark structured streaming jobs which starts from a historical timestamp: diff --git a/docs/spark-writes.md b/docs/spark-writes.md index 08f788fe26bb..2a89fa8721f4 100644 --- a/docs/spark-writes.md +++ b/docs/spark-writes.md @@ -33,16 +33,16 @@ Some plans are only available when using [Iceberg SQL extensions](../spark-confi Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. 
Spark DSv2 is an evolving API with different levels of support in Spark versions: -| Feature support | Spark 3 | Spark 2.4 | Notes | -|--------------------------------------------------|-----------|------------|----------------------------------------------| -| [SQL insert into](#insert-into) | ✔️ | | | -| [SQL merge into](#merge-into) | ✔️ | | ⚠ Requires Iceberg Spark extensions | -| [SQL insert overwrite](#insert-overwrite) | ✔️ | | | -| [SQL delete from](#delete-from) | ✔️ | | ⚠ Row-level delete requires Spark extensions | -| [SQL update](#update) | ✔️ | | ⚠ Requires Iceberg Spark extensions | -| [DataFrame append](#appending-data) | ✔️ | ✔️ | | -| [DataFrame overwrite](#overwriting-data) | ✔️ | ✔️ | ⚠ Behavior changed in Spark 3 | -| [DataFrame CTAS and RTAS](#creating-tables) | ✔️ | | | +| Feature support | Spark 3 | Notes | +|--------------------------------------------------|-----------|----------------------------------------------| +| [SQL insert into](#insert-into) | ✔️ | | +| [SQL merge into](#merge-into) | ✔️ | ⚠ Requires Iceberg Spark extensions | +| [SQL insert overwrite](#insert-overwrite) | ✔️ | | +| [SQL delete from](#delete-from) | ✔️ | ⚠ Row-level delete requires Spark extensions | +| [SQL update](#update) | ✔️ | ⚠ Requires Iceberg Spark extensions | +| [DataFrame append](#appending-data) | ✔️ | | +| [DataFrame overwrite](#overwriting-data) | ✔️ | | +| [DataFrame CTAS and RTAS](#creating-tables) | ✔️ | | ## Writing with SQL @@ -234,17 +234,6 @@ val data: DataFrame = ... data.writeTo("prod.db.table").append() ``` -#### Spark 2.4 - -In Spark 2.4, use the v1 API with `append` mode and `iceberg` format: - -```scala -data.write - .format("iceberg") - .mode("append") - .save("db.table") -``` - ### Overwriting data To overwrite partitions dynamically, use `overwritePartitions()`: @@ -260,23 +249,6 @@ To explicitly overwrite partitions, use `overwrite` to supply a filter: data.writeTo("prod.db.table").overwrite($"level" === "INFO") ``` -#### Spark 2.4 - -In Spark 2.4, overwrite values in an Iceberg table with `overwrite` mode and `iceberg` format: - -```scala -data.write - .format("iceberg") - .mode("overwrite") - .save("db.table") -``` - -{{< hint danger >}} -**The behavior of overwrite mode changed between Spark 2.4 and Spark 3**. -{{< /hint >}} - -The behavior of DataFrameWriter overwrite mode was undefined in Spark 2.4, but is required to overwrite the entire table in Spark 3. Because of this new requirement, the Iceberg source's behavior changed in Spark 3. In Spark 2.4, the behavior was to dynamically overwrite partitions. To use the Spark 2.4 behavior, add option `overwrite-mode=dynamic`. - ### Creating tables To run a CTAS or RTAS, use `create`, `replace`, or `createOrReplace` operations: diff --git a/gradle.properties b/gradle.properties index 8d8a9f0dc021..eb0da0ac8547 100644 --- a/gradle.properties +++ b/gradle.properties @@ -21,7 +21,7 @@ systemProp.knownFlinkVersions=1.15,1.16,1.17 systemProp.defaultHiveVersions=2 systemProp.knownHiveVersions=2,3 systemProp.defaultSparkVersions=3.4 -systemProp.knownSparkVersions=2.4,3.1,3.2,3.3,3.4 +systemProp.knownSparkVersions=3.1,3.2,3.3,3.4 systemProp.defaultScalaVersion=2.12 systemProp.knownScalaVersions=2.12,2.13 org.gradle.parallel=true diff --git a/jmh.gradle b/jmh.gradle index e560365931b8..31d544838b3b 100644 --- a/jmh.gradle +++ b/jmh.gradle @@ -25,10 +25,6 @@ def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getPro def scalaVersion = System.getProperty("scalaVersion") != null ? 
System.getProperty("scalaVersion") : System.getProperty("defaultScalaVersion") def jmhProjects = [project(":iceberg-core")] -if (jdkVersion == '8' && sparkVersions.contains("2.4")) { - jmhProjects.add(project(":iceberg-spark:iceberg-spark-2.4")) -} - if (sparkVersions.contains("3.1")) { jmhProjects.add(project(":iceberg-spark:iceberg-spark-3.1_2.12")) } diff --git a/settings.gradle b/settings.gradle index 40db31aa1282..8e84af8553e7 100644 --- a/settings.gradle +++ b/settings.gradle @@ -183,16 +183,6 @@ if (hiveVersions.contains("2") || hiveVersions.contains("3")) { } if (JavaVersion.current() == JavaVersion.VERSION_1_8) { - if (sparkVersions.contains("2.4")) { - include ':iceberg-spark:spark-2.4' - include ':iceberg-spark:spark-runtime-2.4' - - project(':iceberg-spark:spark-2.4').projectDir = file('spark/v2.4/spark') - project(':iceberg-spark:spark-2.4').name = 'iceberg-spark-2.4' - project(':iceberg-spark:spark-runtime-2.4').projectDir = file('spark/v2.4/spark-runtime') - project(':iceberg-spark:spark-runtime-2.4').name = 'iceberg-spark-runtime-2.4' - } - if (hiveVersions.contains("3")) { include 'hive3' include 'hive3-orc-bundle' diff --git a/spark/build.gradle b/spark/build.gradle index f9947e34a034..77efb998d6f5 100644 --- a/spark/build.gradle +++ b/spark/build.gradle @@ -20,10 +20,6 @@ // add enabled Spark version modules to the build def sparkVersions = (System.getProperty("sparkVersions") != null ? System.getProperty("sparkVersions") : System.getProperty("defaultSparkVersions")).split(",") -if (jdkVersion == '8' && sparkVersions.contains("2.4")) { - apply from: file("$projectDir/v2.4/build.gradle") -} - if (sparkVersions.contains("3.1")) { apply from: file("$projectDir/v3.1/build.gradle") } diff --git a/spark/v2.4/build.gradle b/spark/v2.4/build.gradle deleted file mode 100644 index 20599c9a9cd0..000000000000 --- a/spark/v2.4/build.gradle +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -if (jdkVersion != '8') { - throw new GradleException("Spark 2.4 must be built with Java 8") -} - -def sparkProjects = [ - project(':iceberg-spark:iceberg-spark-2.4'), - project(':iceberg-spark:iceberg-spark-runtime-2.4') -] - -configure(sparkProjects) { - project.ext { - sparkVersion = '2.4.8' - } - - configurations { - all { - resolutionStrategy { - force 'com.fasterxml.jackson.module:jackson-module-scala_2.11:2.11.4' - force 'com.fasterxml.jackson.module:jackson-module-paranamer:2.11.4' - force 'com.fasterxml.jackson.core:jackson-core:2.11.4' - force 'com.fasterxml.jackson.core:jackson-databind:2.11.4' - } - } - } -} - -project(':iceberg-spark:iceberg-spark-2.4') { - configurations.all { - resolutionStrategy { - // Spark 2.4.4 can only use the below datanucleus version, the versions introduced - // by Hive 2.3.6 will meet lots of unexpected issues, so here force to use the versions - // introduced by Hive 1.2.1. - force 'org.datanucleus:datanucleus-api-jdo:3.2.6' - force 'org.datanucleus:datanucleus-core:3.2.10' - force 'org.datanucleus:datanucleus-rdbms:3.2.9' - } - } - - dependencies { - implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') - api project(':iceberg-api') - implementation project(':iceberg-common') - implementation project(':iceberg-core') - implementation project(':iceberg-data') - implementation project(':iceberg-orc') - implementation project(':iceberg-parquet') - implementation project(':iceberg-arrow') - implementation "com.github.ben-manes.caffeine:caffeine" - - compileOnly "com.google.errorprone:error_prone_annotations" - compileOnly "org.apache.avro:avro" - compileOnly("org.apache.spark:spark-hive_2.11:${sparkVersion}") { - exclude group: 'org.apache.avro', module: 'avro' - exclude group: 'org.roaringbitmap' - } - - implementation("org.apache.orc:orc-core::nohive") { - exclude group: 'org.apache.hadoop' - exclude group: 'commons-lang' - // These artifacts are shaded and included in the orc-core fat jar - exclude group: 'com.google.protobuf', module: 'protobuf-java' - exclude group: 'org.apache.hive', module: 'hive-storage-api' - } - - implementation("org.apache.arrow:arrow-vector") { - exclude group: 'io.netty', module: 'netty-buffer' - exclude group: 'io.netty', module: 'netty-common' - exclude group: 'com.google.code.findbugs', module: 'jsr305' - } - - testImplementation("org.apache.hadoop:hadoop-minicluster") { - exclude group: 'org.apache.avro', module: 'avro' - } - testImplementation project(path: ':iceberg-hive-metastore') - testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') - testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - } - - test { - // For vectorized reads - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds - systemProperty("arrow.enable_unsafe_memory_access", "true") - // Disable expensive null check for every get(index) call. - // Iceberg manages nullability checks itself instead of relying on arrow. 
- systemProperty("arrow.enable_null_check_for_get", "false") - - // Vectorized reads need more memory - maxHeapSize '2500m' - } -} - -// the runtime jar is a self-contained artifact for testing in a notebook -project(':iceberg-spark:iceberg-spark-runtime-2.4') { - apply plugin: 'com.github.johnrengelman.shadow' - - tasks.jar.dependsOn tasks.shadowJar - - configurations { - implementation { - exclude group: 'org.apache.spark' - // included in Spark - exclude group: 'org.slf4j' - exclude group: 'org.apache.commons' - exclude group: 'commons-pool' - exclude group: 'commons-codec' - exclude group: 'org.xerial.snappy' - exclude group: 'javax.xml.bind' - exclude group: 'javax.annotation' - } - } - - dependencies { - implementation project(':iceberg-spark:iceberg-spark-2.4') - implementation project(':iceberg-aws') - implementation(project(':iceberg-aliyun')) { - exclude group: 'edu.umd.cs.findbugs', module: 'findbugs' - exclude group: 'org.apache.httpcomponents', module: 'httpclient' - exclude group: 'commons-logging', module: 'commons-logging' - } - implementation project(':iceberg-hive-metastore') - implementation(project(':iceberg-nessie')) { - exclude group: 'com.google.code.findbugs', module: 'jsr305' - } - } - - shadowJar { - configurations = [project.configurations.runtimeClasspath] - - zip64 true - - // include the LICENSE and NOTICE files for the shaded Jar - from(projectDir) { - include 'LICENSE' - include 'NOTICE' - } - - // Relocate dependencies to avoid conflicts - relocate 'com.google', 'org.apache.iceberg.shaded.com.google' - relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml' - relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes' - relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework' - relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' - relocate 'avro.shaded', 'org.apache.iceberg.shaded.org.apache.avro.shaded' - relocate 'com.thoughtworks.paranamer', 'org.apache.iceberg.shaded.com.thoughtworks.paranamer' - relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' - relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' - relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' - relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' - relocate 'org.apache.httpcomponents.client5', 'org.apache.iceberg.shaded.org.apache.httpcomponents.client5' - // relocate Arrow and related deps to shade Iceberg specific version - relocate 'io.netty.buffer', 'org.apache.iceberg.shaded.io.netty.buffer' - relocate 'org.apache.arrow', 'org.apache.iceberg.shaded.org.apache.arrow' - relocate 'com.carrotsearch', 'org.apache.iceberg.shaded.com.carrotsearch' - relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra' - relocate 'org.roaringbitmap', 'org.apache.iceberg.shaded.org.roaringbitmap' - - archiveClassifier.set(null) - } - - jar { - enabled = false - } -} diff --git a/spark/v2.4/spark-runtime/LICENSE b/spark/v2.4/spark-runtime/LICENSE deleted file mode 100644 index 79cd289d1f42..000000000000 --- a/spark/v2.4/spark-runtime/LICENSE +++ /dev/null @@ -1,606 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- --------------------------------------------------------------------------------- - -This binary artifact contains Apache Avro. - -Copyright: 2014-2017 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains the Jackson JSON processor. - -Copyright: 2007-2019 Tatu Saloranta and other contributors -Home page: http://jackson.codehaus.org/ -License: http://www.apache.org/licenses/LICENSE-2.0.txt - --------------------------------------------------------------------------------- - -This binary artifact contains Paranamer. - -Copyright: 2000-2007 INRIA, France Telecom, 2006-2018 Paul Hammant & ThoughtWorks Inc -Home page: https://github.com/paul-hammant/paranamer -License: https://github.com/paul-hammant/paranamer/blob/master/LICENSE.txt (BSD) - -License text: -| Portions copyright (c) 2006-2018 Paul Hammant & ThoughtWorks Inc -| Portions copyright (c) 2000-2007 INRIA, France Telecom -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions -| are met: -| 1. Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| 2. Redistributions in binary form must reproduce the above copyright -| notice, this list of conditions and the following disclaimer in the -| documentation and/or other materials provided with the distribution. -| 3. Neither the name of the copyright holders nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -| THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Parquet. - -Copyright: 2014-2017 The Apache Software Foundation. -Home page: https://parquet.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Thrift. - -Copyright: 2006-2010 The Apache Software Foundation. -Home page: https://thrift.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains code from Daniel Lemire's JavaFastPFOR project. 
- -Copyright: 2013 Daniel Lemire -Home page: https://github.com/lemire/JavaFastPFOR -License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains fastutil. - -Copyright: 2002-2014 Sebastiano Vigna -Home page: http://fastutil.di.unimi.it/ -License: http://www.apache.org/licenses/LICENSE-2.0.html - --------------------------------------------------------------------------------- - -This binary artifact contains Apache ORC. - -Copyright: 2013-2019 The Apache Software Foundation. -Home page: https://orc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Hive's storage API via ORC. - -Copyright: 2013-2019 The Apache Software Foundation. -Home page: https://hive.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google protobuf via ORC. - -Copyright: 2008 Google Inc. -Home page: https://developers.google.com/protocol-buffers -License: https://github.com/protocolbuffers/protobuf/blob/master/LICENSE (BSD) - -License text: - -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This binary artifact contains Airlift Aircompressor. - -Copyright: 2011-2019 Aircompressor authors. 
-Home page: https://github.com/airlift/aircompressor -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Airlift Slice. - -Copyright: 2013-2019 Slice authors. -Home page: https://github.com/airlift/slice -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains JetBrains annotations. - -Copyright: 2000-2020 JetBrains s.r.o. -Home page: https://github.com/JetBrains/java-annotations -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains code from Cloudera Kite. - -Copyright: 2013-2017 Cloudera Inc. -Home page: https://kitesdk.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains code from Presto. - -Copyright: 2016 Facebook and contributors -Home page: https://prestodb.io/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Guava. - -Copyright: 2006-2019 The Guava Authors -Home page: https://github.com/google/guava -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google Error Prone Annotations. - -Copyright: Copyright 2011-2019 The Error Prone Authors -Home page: https://github.com/google/error-prone -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains findbugs-annotations by Stephen Connolly. - -Copyright: 2011-2016 Stephen Connolly, Greg Lucas -Home page: https://github.com/stephenc/findbugs-annotations -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google j2objc Annotations. - -Copyright: Copyright 2012-2018 Google Inc. -Home page: https://github.com/google/j2objc/tree/master/annotations -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains checkerframework checker-qual Annotations. - -Copyright: 2004-2019 the Checker Framework developers -Home page: https://github.com/typetools/checker-framework -License: https://github.com/typetools/checker-framework/blob/master/LICENSE.txt (MIT license) - -License text: -| The annotations are licensed under the MIT License. (The text of this -| license appears below.) More specifically, all the parts of the Checker -| Framework that you might want to include with your own program use the -| MIT License. This is the checker-qual.jar file and all the files that -| appear in it: every file in a qual/ directory, plus utility files such -| as NullnessUtil.java, RegexUtil.java, SignednessUtil.java, etc. -| In addition, the cleanroom implementations of third-party annotations, -| which the Checker Framework recognizes as aliases for its own -| annotations, are licensed under the MIT License. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This binary artifact contains Animal Sniffer Annotations. - -Copyright: 2009-2018 codehaus.org -Home page: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/ -License: https://www.mojohaus.org/animal-sniffer/animal-sniffer-annotations/license.html (MIT license) - -License text: -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This binary artifact contains Caffeine by Ben Manes. - -Copyright: 2014-2019 Ben Manes and contributors -Home page: https://github.com/ben-manes/caffeine -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Arrow. - -Copyright: 2016-2019 The Apache Software Foundation. -Home page: https://arrow.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Netty's buffer library. 
- -Copyright: 2014-2020 The Netty Project -Home page: https://netty.io/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Google FlatBuffers. - -Copyright: 2013-2020 Google Inc. -Home page: https://google.github.io/flatbuffers/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Carrot Search Labs HPPC. - -Copyright: 2002-2019 Carrot Search s.c. -Home page: http://labs.carrotsearch.com/hppc.html -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains code from Apache Lucene via Carrot Search HPPC. - -Copyright: 2011-2020 The Apache Software Foundation. -Home page: https://lucene.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache Yetus audience annotations. - -Copyright: 2008-2020 The Apache Software Foundation. -Home page: https://yetus.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains ThreeTen. - -Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. -Home page: https://www.threeten.org/threeten-extra/ -License: https://github.com/ThreeTen/threeten-extra/blob/master/LICENSE.txt (BSD 3-clause) - -License text: - -| All rights reserved. -| -| * Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact contains code from Project Nessie. - -Copyright: 2020 Dremio Corporation. 
-Home page: https://projectnessie.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary includes code from Apache Commons. - -* Core ArrayUtil. - -Copyright: 2020 The Apache Software Foundation -Home page: https://commons.apache.org/ -License: https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This binary artifact contains Apache HttpComponents Client. - -Copyright: 1999-2022 The Apache Software Foundation. -Home page: https://hc.apache.org/ -License: http://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v2.4/spark-runtime/NOTICE b/spark/v2.4/spark-runtime/NOTICE deleted file mode 100644 index 4a1f4dfde1cc..000000000000 --- a/spark/v2.4/spark-runtime/NOTICE +++ /dev/null @@ -1,508 +0,0 @@ - -Apache Iceberg -Copyright 2017-2022 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - --------------------------------------------------------------------------------- - -This binary artifact contains code from Kite, developed at Cloudera, Inc. with -the following copyright notice: - -| Copyright 2013 Cloudera Inc. -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. - --------------------------------------------------------------------------------- - -This binary artifact includes Apache ORC with the following in its NOTICE file: - -| Apache ORC -| Copyright 2013-2019 The Apache Software Foundation -| -| This product includes software developed by The Apache Software -| Foundation (http://www.apache.org/). -| -| This product includes software developed by Hewlett-Packard: -| (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P - --------------------------------------------------------------------------------- - -This binary artifact includes Airlift Aircompressor with the following in its -NOTICE file: - -| Snappy Copyright Notices -| ========================= -| -| * Copyright 2011 Dain Sundstrom -| * Copyright 2011, Google Inc. -| -| -| Snappy License -| =============== -| Copyright 2011, Google Inc. -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This binary artifact includes Carrot Search Labs HPPC with the following in its -NOTICE file: - -| ACKNOWLEDGEMENT -| =============== -| -| HPPC borrowed code, ideas or both from: -| -| * Apache Lucene, http://lucene.apache.org/ -| (Apache license) -| * Fastutil, http://fastutil.di.unimi.it/ -| (Apache license) -| * Koloboke, https://github.com/OpenHFT/Koloboke -| (Apache license) - --------------------------------------------------------------------------------- - -This binary artifact includes Apache Yetus with the following in its NOTICE -file: - -| Apache Yetus -| Copyright 2008-2020 The Apache Software Foundation -| -| This product includes software developed at -| The Apache Software Foundation (https://www.apache.org/). -| -| --- -| Additional licenses for the Apache Yetus Source/Website: -| --- -| -| -| See LICENSE for terms. - --------------------------------------------------------------------------------- - -This binary artifact includes Google Protobuf with the following copyright -notice: - -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This binary artifact includes Apache Arrow with the following in its NOTICE file: - -| Apache Arrow -| Copyright 2016-2019 The Apache Software Foundation -| -| This product includes software developed at -| The Apache Software Foundation (http://www.apache.org/). -| -| This product includes software from the SFrame project (BSD, 3-clause). -| * Copyright (C) 2015 Dato, Inc. -| * Copyright (c) 2009 Carnegie Mellon University. -| -| This product includes software from the Feather project (Apache 2.0) -| https://github.com/wesm/feather -| -| This product includes software from the DyND project (BSD 2-clause) -| https://github.com/libdynd -| -| This product includes software from the LLVM project -| * distributed under the University of Illinois Open Source -| -| This product includes software from the google-lint project -| * Copyright (c) 2009 Google Inc. All rights reserved. -| -| This product includes software from the mman-win32 project -| * Copyright https://code.google.com/p/mman-win32/ -| * Licensed under the MIT License; -| -| This product includes software from the LevelDB project -| * Copyright (c) 2011 The LevelDB Authors. All rights reserved. -| * Use of this source code is governed by a BSD-style license that can be -| * Moved from Kudu http://github.com/cloudera/kudu -| -| This product includes software from the CMake project -| * Copyright 2001-2009 Kitware, Inc. -| * Copyright 2012-2014 Continuum Analytics, Inc. -| * All rights reserved. -| -| This product includes software from https://github.com/matthew-brett/multibuild (BSD 2-clause) -| * Copyright (c) 2013-2016, Matt Terry and Matthew Brett; all rights reserved. -| -| This product includes software from the Ibis project (Apache 2.0) -| * Copyright (c) 2015 Cloudera, Inc. -| * https://github.com/cloudera/ibis -| -| This product includes software from Dremio (Apache 2.0) -| * Copyright (C) 2017-2018 Dremio Corporation -| * https://github.com/dremio/dremio-oss -| -| This product includes software from Google Guava (Apache 2.0) -| * Copyright (C) 2007 The Guava Authors -| * https://github.com/google/guava -| -| This product include software from CMake (BSD 3-Clause) -| * CMake - Cross Platform Makefile Generator -| * Copyright 2000-2019 Kitware, Inc. and Contributors -| -| The web site includes files generated by Jekyll. -| -| -------------------------------------------------------------------------------- -| -| This product includes code from Apache Kudu, which includes the following in -| its NOTICE file: -| -| Apache Kudu -| Copyright 2016 The Apache Software Foundation -| -| This product includes software developed at -| The Apache Software Foundation (http://www.apache.org/). -| -| Portions of this software were developed at -| Cloudera, Inc (http://www.cloudera.com/). -| -| -------------------------------------------------------------------------------- -| -| This product includes code from Apache ORC, which includes the following in -| its NOTICE file: -| -| Apache ORC -| Copyright 2013-2019 The Apache Software Foundation -| -| This product includes software developed by The Apache Software -| Foundation (http://www.apache.org/). 
-| -| This product includes software developed by Hewlett-Packard: -| (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P - --------------------------------------------------------------------------------- - -This binary artifact includes Netty buffers with the following in its NOTICE -file: - -| The Netty Project -| ================= -| -| Please visit the Netty web site for more information: -| -| * https://netty.io/ -| -| Copyright 2014 The Netty Project -| -| The Netty Project licenses this file to you under the Apache License, -| version 2.0 (the "License"); you may not use this file except in compliance -| with the License. You may obtain a copy of the License at: -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| License for the specific language governing permissions and limitations -| under the License. -| -| Also, please refer to each LICENSE..txt file, which is located in -| the 'license' directory of the distribution file, for the license terms of the -| components that this product depends on. -| -| ------------------------------------------------------------------------------- -| This product contains the extensions to Java Collections Framework which has -| been derived from the works by JSR-166 EG, Doug Lea, and Jason T. Greene: -| -| * LICENSE: -| * license/LICENSE.jsr166y.txt (Public Domain) -| * HOMEPAGE: -| * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ -| * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ -| -| This product contains a modified version of Robert Harder's Public Domain -| Base64 Encoder and Decoder, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.base64.txt (Public Domain) -| * HOMEPAGE: -| * http://iharder.sourceforge.net/current/java/base64/ -| -| This product contains a modified portion of 'Webbit', an event based -| WebSocket and HTTP server, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.webbit.txt (BSD License) -| * HOMEPAGE: -| * https://github.com/joewalnes/webbit -| -| This product contains a modified portion of 'SLF4J', a simple logging -| facade for Java, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.slf4j.txt (MIT License) -| * HOMEPAGE: -| * http://www.slf4j.org/ -| -| This product contains a modified portion of 'Apache Harmony', an open source -| Java SE, which can be obtained at: -| -| * NOTICE: -| * license/NOTICE.harmony.txt -| * LICENSE: -| * license/LICENSE.harmony.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://archive.apache.org/dist/harmony/ -| -| This product contains a modified portion of 'jbzip2', a Java bzip2 compression -| and decompression library written by Matthew J. Francis. It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.jbzip2.txt (MIT License) -| * HOMEPAGE: -| * https://code.google.com/p/jbzip2/ -| -| This product contains a modified portion of 'libdivsufsort', a C API library to construct -| the suffix array and the Burrows-Wheeler transformed string for any input string of -| a constant-size alphabet written by Yuta Mori. 
It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.libdivsufsort.txt (MIT License) -| * HOMEPAGE: -| * https://github.com/y-256/libdivsufsort -| -| This product contains a modified portion of Nitsan Wakart's 'JCTools', Java Concurrency Tools for the JVM, -| which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.jctools.txt (ASL2 License) -| * HOMEPAGE: -| * https://github.com/JCTools/JCTools -| -| This product optionally depends on 'JZlib', a re-implementation of zlib in -| pure Java, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.jzlib.txt (BSD style License) -| * HOMEPAGE: -| * http://www.jcraft.com/jzlib/ -| -| This product optionally depends on 'Compress-LZF', a Java library for encoding and -| decoding data in LZF format, written by Tatu Saloranta. It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.compress-lzf.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/ning/compress -| -| This product optionally depends on 'lz4', a LZ4 Java compression -| and decompression library written by Adrien Grand. It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.lz4.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/jpountz/lz4-java -| -| This product optionally depends on 'lzma-java', a LZMA Java compression -| and decompression library, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.lzma-java.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/jponge/lzma-java -| -| This product contains a modified portion of 'jfastlz', a Java port of FastLZ compression -| and decompression library written by William Kinney. It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.jfastlz.txt (MIT License) -| * HOMEPAGE: -| * https://code.google.com/p/jfastlz/ -| -| This product contains a modified portion of and optionally depends on 'Protocol Buffers', Google's data -| interchange format, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.protobuf.txt (New BSD License) -| * HOMEPAGE: -| * https://github.com/google/protobuf -| -| This product optionally depends on 'Bouncy Castle Crypto APIs' to generate -| a temporary self-signed X.509 certificate when the JVM does not provide the -| equivalent functionality. 
It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.bouncycastle.txt (MIT License) -| * HOMEPAGE: -| * http://www.bouncycastle.org/ -| -| This product optionally depends on 'Snappy', a compression library produced -| by Google Inc, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.snappy.txt (New BSD License) -| * HOMEPAGE: -| * https://github.com/google/snappy -| -| This product optionally depends on 'JBoss Marshalling', an alternative Java -| serialization API, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.jboss-marshalling.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/jboss-remoting/jboss-marshalling -| -| This product optionally depends on 'Caliper', Google's micro- -| benchmarking framework, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.caliper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/google/caliper -| -| This product optionally depends on 'Apache Commons Logging', a logging -| framework, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.commons-logging.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://commons.apache.org/logging/ -| -| This product optionally depends on 'Apache Log4J', a logging framework, which -| can be obtained at: -| -| * LICENSE: -| * license/LICENSE.log4j.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://logging.apache.org/log4j/ -| -| This product optionally depends on 'Aalto XML', an ultra-high performance -| non-blocking XML processor, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.aalto-xml.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://wiki.fasterxml.com/AaltoHome -| -| This product contains a modified version of 'HPACK', a Java implementation of -| the HTTP/2 HPACK algorithm written by Twitter. It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.hpack.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/twitter/hpack -| -| This product contains a modified version of 'HPACK', a Java implementation of -| the HTTP/2 HPACK algorithm written by Cory Benfield. It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.hyper-hpack.txt (MIT License) -| * HOMEPAGE: -| * https://github.com/python-hyper/hpack/ -| -| This product contains a modified version of 'HPACK', a Java implementation of -| the HTTP/2 HPACK algorithm written by Tatsuhiro Tsujikawa. It can be obtained at: -| -| * LICENSE: -| * license/LICENSE.nghttp2-hpack.txt (MIT License) -| * HOMEPAGE: -| * https://github.com/nghttp2/nghttp2/ -| -| This product contains a modified portion of 'Apache Commons Lang', a Java library -| provides utilities for the java.lang API, which can be obtained at: -| -| * LICENSE: -| * license/LICENSE.commons-lang.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://commons.apache.org/proper/commons-lang/ -| -| -| This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| -| * LICENSE: -| * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/takari/maven-wrapper -| -| This product contains the dnsinfo.h header file, that provides a way to retrieve the system DNS configuration on MacOS. -| This private header is also used by Apple's open source -| mDNSResponder (https://opensource.apple.com/tarballs/mDNSResponder/). 
-| -| * LICENSE: -| * license/LICENSE.dnsinfo.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://www.opensource.apple.com/source/configd/configd-453.19/dnsinfo/dnsinfo.h - --------------------------------------------------------------------------------- - -This binary artifact includes Project Nessie with the following in its NOTICE -file: - -| Dremio -| Copyright 2015-2017 Dremio Corporation -| -| This product includes software developed at -| The Apache Software Foundation (http://www.apache.org/). - diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java deleted file mode 100644 index d6b0e9c94258..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import java.util.List; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.expressions.Attribute; -import org.apache.spark.sql.catalyst.expressions.AttributeReference; -import org.apache.spark.sql.catalyst.expressions.Expression; -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection; -import org.apache.spark.sql.types.StructType; -import scala.collection.JavaConverters; - -public class SparkBenchmarkUtil { - - private SparkBenchmarkUtil() {} - - public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) { - StructType struct = SparkSchemaUtil.convert(actualSchema); - - List refs = - JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); - List attrs = Lists.newArrayListWithExpectedSize(struct.fields().length); - List exprs = Lists.newArrayListWithExpectedSize(struct.fields().length); - - for (AttributeReference ref : refs) { - attrs.add(ref.toAttribute()); - } - - for (Types.NestedField field : expectedSchema.columns()) { - int indexInIterSchema = struct.fieldIndex(field.name()); - exprs.add(refs.get(indexInIterSchema)); - } - - return UnsafeProjection.create( - JavaConverters.asScalaBufferConverter(exprs).asScala().toSeq(), - JavaConverters.asScalaBufferConverter(attrs).asScala().toSeq()); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java deleted file mode 100644 index 9b0cc5c5e27c..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java +++ /dev/null @@ -1,222 
+0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.parquet; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.avro.generic.GenericData; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.common.DynMethods; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.spark.SparkBenchmarkUtil; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.SparkParquetReaders; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection; -import org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport; -import org.apache.spark.sql.types.StructType; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.infra.Blackhole; - -/** - * A benchmark that evaluates the performance of reading Parquet data with a flat schema using - * Iceberg and Spark Parquet readers. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=SparkParquetReadersFlatDataBenchmark - * -PjmhOutputPath=benchmark/spark-parquet-readers-flat-data-benchmark-result.txt - * - */ -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class SparkParquetReadersFlatDataBenchmark { - - private static final DynMethods.UnboundMethod APPLY_PROJECTION = - DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); - private static final Schema SCHEMA = - new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - private static final Schema PROJECTED_SCHEMA = - new Schema( - required(1, "longCol", Types.LongType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(8, "stringCol", Types.StringType.get())); - private static final int NUM_RECORDS = 10000000; - private File dataFile; - - @Setup - public void setupBenchmark() throws IOException { - dataFile = File.createTempFile("parquet-flat-data-benchmark", ".parquet"); - dataFile.delete(); - List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = - Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { - writer.addAll(records); - } - } - - @TearDown - public void tearDownBenchmark() { - if (dataFile != null) { - dataFile.delete(); - } - } - - @Benchmark - @Threads(1) - public void readUsingIcebergReader(Blackhole blackHole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { - - for (InternalRow row : rows) { - blackHole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { - - Iterable unsafeRows = - Iterables.transform( - rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); - - for (InternalRow row : unsafeRows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readUsingSparkReader(Blackhole blackhole) throws IOException { - StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { - - for (InternalRow row : rows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - 
.createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - for (InternalRow row : rows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = - Iterables.transform( - rows, - APPLY_PROJECTION.bind( - SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) - ::invoke); - - for (InternalRow row : unsafeRows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { - StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { - - for (InternalRow row : rows) { - blackhole.consume(row); - } - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java deleted file mode 100644 index eafa60b826cc..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data.parquet; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.avro.generic.GenericData; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.common.DynMethods; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.spark.SparkBenchmarkUtil; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.SparkParquetReaders; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection; -import org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport; -import org.apache.spark.sql.types.StructType; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; -import org.openjdk.jmh.infra.Blackhole; - -/** - * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and Spark - * Parquet readers. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=SparkParquetReadersNestedDataBenchmark - * -PjmhOutputPath=benchmark/spark-parquet-readers-nested-data-benchmark-result.txt - * - */ -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class SparkParquetReadersNestedDataBenchmark { - - private static final DynMethods.UnboundMethod APPLY_PROJECTION = - DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); - private static final Schema SCHEMA = - new Schema( - required(0, "id", Types.LongType.get()), - optional( - 4, - "nested", - Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get())))); - private static final Schema PROJECTED_SCHEMA = - new Schema( - optional(4, "nested", Types.StructType.of(required(1, "col1", Types.StringType.get())))); - private static final int NUM_RECORDS = 10000000; - private File dataFile; - - @Setup - public void setupBenchmark() throws IOException { - dataFile = File.createTempFile("parquet-nested-data-benchmark", ".parquet"); - dataFile.delete(); - List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = - Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { - writer.addAll(records); - } - } - - @TearDown - public void tearDownBenchmark() { - if (dataFile != null) { - dataFile.delete(); - } - } - - @Benchmark - @Threads(1) - public void readUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { - - for (InternalRow row : rows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { - - Iterable unsafeRows = - Iterables.transform( - rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); - - for (InternalRow row : unsafeRows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readUsingSparkReader(Blackhole blackhole) throws IOException { - StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { - - for (InternalRow row : rows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - for (InternalRow row : rows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void 
readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = - Iterables.transform( - rows, - APPLY_PROJECTION.bind( - SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) - ::invoke); - - for (InternalRow row : unsafeRows) { - blackhole.consume(row); - } - } - } - - @Benchmark - @Threads(1) - public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { - StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = - Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { - - for (InternalRow row : rows) { - blackhole.consume(row); - } - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java deleted file mode 100644 index c711bfad1a57..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data.parquet; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.SparkParquetWriters; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; -import org.apache.spark.sql.types.StructType; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Level; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; - -/** - * A benchmark that evaluates the performance of writing Parquet data with a flat schema using - * Iceberg and Spark Parquet writers. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=SparkParquetWritersFlatDataBenchmark - * -PjmhOutputPath=benchmark/spark-parquet-writers-flat-data-benchmark-result.txt - * - */ -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class SparkParquetWritersFlatDataBenchmark { - - private static final Schema SCHEMA = - new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - private static final int NUM_RECORDS = 1000000; - private Iterable rows; - private File dataFile; - - @Setup - public void setupBenchmark() throws IOException { - rows = RandomData.generateSpark(SCHEMA, NUM_RECORDS, 0L); - dataFile = File.createTempFile("parquet-flat-data-benchmark", ".parquet"); - dataFile.delete(); - } - - @TearDown(Level.Iteration) - public void tearDownBenchmark() { - if (dataFile != null) { - dataFile.delete(); - } - } - - @Benchmark - @Threads(1) - public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = - Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc( - msgType -> - SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { - - writer.addAll(rows); - } - } - - @Benchmark - @Threads(1) - public void writeUsingSparkWriter() throws IOException { - StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = - Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { - - writer.addAll(rows); - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java deleted file mode 100644 index 794444d9728e..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.parquet; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.SparkParquetWriters; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; -import org.apache.spark.sql.types.StructType; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Level; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; - -/** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and Spark - * Parquet writers. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=SparkParquetWritersNestedDataBenchmark - * -PjmhOutputPath=benchmark/spark-parquet-writers-nested-data-benchmark-result.txt - * - */ -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public class SparkParquetWritersNestedDataBenchmark { - - private static final Schema SCHEMA = - new Schema( - required(0, "id", Types.LongType.get()), - optional( - 4, - "nested", - Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get())))); - private static final int NUM_RECORDS = 1000000; - private Iterable rows; - private File dataFile; - - @Setup - public void setupBenchmark() throws IOException { - rows = RandomData.generateSpark(SCHEMA, NUM_RECORDS, 0L); - dataFile = File.createTempFile("parquet-nested-data-benchmark", ".parquet"); - dataFile.delete(); - } - - @TearDown(Level.Iteration) - public void tearDownBenchmark() { - if (dataFile != null) { - dataFile.delete(); - } - } - - @Benchmark - @Threads(1) - public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = - Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc( - msgType -> - SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { - - writer.addAll(rows); - } - } - - @Benchmark - @Threads(1) - public void writeUsingSparkWriter() throws IOException { - StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = - Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { - - writer.addAll(rows); - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java deleted file mode 100644 index 0dbf07285060..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -@FunctionalInterface -public interface Action { - void invoke(); -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java deleted file mode 100644 index 19bcdd672157..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.IOException; -import java.util.Map; -import java.util.UUID; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.UpdateProperties; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.internal.SQLConf; -import org.apache.spark.sql.types.StructType; -import org.openjdk.jmh.annotations.BenchmarkMode; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Mode; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.State; -import org.openjdk.jmh.annotations.Warmup; - -@Fork(1) -@State(Scope.Benchmark) -@Warmup(iterations = 3) -@Measurement(iterations = 5) -@BenchmarkMode(Mode.SingleShotTime) -public abstract class IcebergSourceBenchmark { - - private final Configuration hadoopConf = initHadoopConf(); - private final Table table = initTable(); - private SparkSession spark; - - protected abstract Configuration initHadoopConf(); - - protected final Configuration hadoopConf() { - return hadoopConf; - } - - protected abstract Table initTable(); - - protected final Table table() { - return table; - } - - protected final SparkSession spark() { - return spark; - } - - protected String newTableLocation() { - String tmpDir = hadoopConf.get("hadoop.tmp.dir"); - Path tablePath = new Path(tmpDir, "spark-iceberg-table-" + UUID.randomUUID()); - return tablePath.toString(); - } - - protected String dataLocation() { - Map properties = table.properties(); - return properties.getOrDefault( - TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); - } - - protected void cleanupFiles() throws IOException { - try (FileSystem fileSystem = FileSystem.get(hadoopConf)) { - Path dataPath = new Path(dataLocation()); - 
fileSystem.delete(dataPath, true); - Path tablePath = new Path(table.location()); - fileSystem.delete(tablePath, true); - } - } - - protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); - if (!enableDictionaryEncoding) { - builder - .config("parquet.dictionary.page.size", "1") - .config("parquet.enable.dictionary", false) - .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); - } - builder.master("local"); - spark = builder.getOrCreate(); - Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); - hadoopConf.forEach(entry -> sparkHadoopConf.set(entry.getKey(), entry.getValue())); - } - - protected void setupSpark() { - setupSpark(false); - } - - protected void tearDownSpark() { - spark.stop(); - } - - protected void materialize(Dataset ds) { - ds.queryExecution().toRdd().toJavaRDD().foreach(record -> {}); - } - - protected void appendAsFile(Dataset ds) { - // ensure the schema is precise (including nullability) - StructType sparkSchema = SparkSchemaUtil.convert(table.schema()); - spark - .createDataFrame(ds.rdd(), sparkSchema) - .coalesce(1) - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(table.location()); - } - - protected void withSQLConf(Map conf, Action action) { - SQLConf sqlConf = SQLConf.get(); - - Map currentConfValues = Maps.newHashMap(); - conf.keySet() - .forEach( - confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach( - (confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); - - try { - action.invoke(); - } finally { - conf.forEach( - (confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); - } - } - - protected void withTableProperties(Map props, Action action) { - Map tableProps = table.properties(); - Map currentPropValues = Maps.newHashMap(); - props - .keySet() - .forEach( - propKey -> { - if (tableProps.containsKey(propKey)) { - String currentPropValue = tableProps.get(propKey); - currentPropValues.put(propKey, currentPropValue); - } - }); - - UpdateProperties updateProperties = table.updateProperties(); - props.forEach(updateProperties::set); - updateProperties.commit(); - - try { - action.invoke(); - } finally { - UpdateProperties restoreProperties = table.updateProperties(); - props.forEach( - (propKey, propValue) -> { - if (currentPropValues.containsKey(propKey)) { - restoreProperties.set(propKey, currentPropValues.get(propKey)); - } else { - restoreProperties.remove(propKey); - } - }); - restoreProperties.commit(); - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java deleted file mode 100644 index 59e6230350d9..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public abstract class IcebergSourceFlatDataBenchmark extends IcebergSourceBenchmark { - - @Override - protected Configuration initHadoopConf() { - return new Configuration(); - } - - @Override - protected final Table initTable() { - Schema schema = - new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); - HadoopTables tables = new HadoopTables(hadoopConf()); - Map properties = Maps.newHashMap(); - properties.put(TableProperties.METADATA_COMPRESSION, "gzip"); - return tables.create(schema, partitionSpec, properties, newTableLocation()); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java deleted file mode 100644 index a1c61b9b4de0..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public abstract class IcebergSourceNestedDataBenchmark extends IcebergSourceBenchmark { - - @Override - protected Configuration initHadoopConf() { - return new Configuration(); - } - - @Override - protected final Table initTable() { - Schema schema = - new Schema( - required(0, "id", Types.LongType.get()), - optional( - 4, - "nested", - Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get())))); - PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); - HadoopTables tables = new HadoopTables(hadoopConf()); - Map properties = Maps.newHashMap(); - properties.put(TableProperties.METADATA_COMPRESSION, "gzip"); - return tables.create(schema, partitionSpec, properties, newTableLocation()); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java deleted file mode 100644 index f68b587735dd..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; - -public abstract class IcebergSourceNestedListDataBenchmark extends IcebergSourceBenchmark { - - @Override - protected Configuration initHadoopConf() { - return new Configuration(); - } - - @Override - protected final Table initTable() { - Schema schema = - new Schema( - required(0, "id", Types.LongType.get()), - optional( - 1, - "outerlist", - Types.ListType.ofOptional( - 2, - Types.StructType.of( - required( - 3, - "innerlist", - Types.ListType.ofRequired(4, Types.StringType.get())))))); - PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); - HadoopTables tables = new HadoopTables(hadoopConf()); - Map properties = Maps.newHashMap(); - properties.put(TableProperties.METADATA_COMPRESSION, "gzip"); - return tables.create(schema, partitionSpec, properties, newTableLocation()); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java deleted file mode 100644 index eace9d3e44a7..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.IOException; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.ClusteredDataWriter; -import org.apache.iceberg.io.ClusteredEqualityDeleteWriter; -import org.apache.iceberg.io.ClusteredPositionDeleteWriter; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.io.FanoutDataWriter; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.UnpartitionedWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.transforms.Transform; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.infra.Blackhole; - -public abstract class WritersBenchmark extends IcebergSourceBenchmark { - - private static final int NUM_ROWS = 2500000; - private static final long TARGET_FILE_SIZE_IN_BYTES = 50L * 1024 * 1024; - - private static final Schema SCHEMA = - new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get())); - - private Iterable rows; - private Iterable positionDeleteRows; - private PartitionSpec unpartitionedSpec; - private PartitionSpec partitionedSpec; - - protected abstract FileFormat fileFormat(); - - @Setup - public void setupBenchmark() { - setupSpark(); - - List data = Lists.newArrayList(RandomData.generateSpark(SCHEMA, NUM_ROWS, 0L)); - Transform transform = Transforms.bucket(Types.IntegerType.get(), 32); - data.sort(Comparator.comparingInt(row -> transform.apply(row.getInt(1)))); - this.rows = data; - - this.positionDeleteRows = - RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); - - this.unpartitionedSpec = table().specs().get(0); - Preconditions.checkArgument(unpartitionedSpec.isUnpartitioned()); - this.partitionedSpec = table().specs().get(1); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Override - protected Configuration initHadoopConf() { - return new Configuration(); - } - - @Override - protected 
final Table initTable() { - HadoopTables tables = new HadoopTables(hadoopConf()); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map properties = Maps.newHashMap(); - Table table = tables.create(SCHEMA, spec, properties, newTableLocation()); - - // add a partitioned spec to the table - table.updateSpec().addField(Expressions.bucket("intCol", 32)).commit(); - - return table; - } - - @Benchmark - @Threads(1) - public void writeUnpartitionedClusteredDataWriter(Blackhole blackhole) throws IOException { - FileIO io = table().io(); - - OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = - SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); - - ClusteredDataWriter writer = - new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); - - try (ClusteredDataWriter closeableWriter = writer) { - for (InternalRow row : rows) { - closeableWriter.write(row, unpartitionedSpec, null); - } - } - - blackhole.consume(writer); - } - - @Benchmark - @Threads(1) - public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOException { - FileIO io = table().io(); - - OutputFileFactory fileFactory = newFileFactory(); - - Schema writeSchema = table().schema(); - StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = - SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(unpartitionedSpec) - .build(); - - TaskWriter writer = - new UnpartitionedWriter<>( - unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); - - try (TaskWriter closableWriter = writer) { - for (InternalRow row : rows) { - closableWriter.write(row); - } - } - - blackhole.consume(writer.complete()); - } - - @Benchmark - @Threads(1) - public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOException { - FileIO io = table().io(); - - OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = - SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); - - ClusteredDataWriter writer = - new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); - - PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); - StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); - InternalRowWrapper internalRowWrapper = new InternalRowWrapper(dataSparkType); - - try (ClusteredDataWriter closeableWriter = writer) { - for (InternalRow row : rows) { - partitionKey.partition(internalRowWrapper.wrap(row)); - closeableWriter.write(row, partitionedSpec, partitionKey); - } - } - - blackhole.consume(writer); - } - - @Benchmark - @Threads(1) - public void writePartitionedLegacyDataWriter(Blackhole blackhole) throws IOException { - FileIO io = table().io(); - - OutputFileFactory fileFactory = newFileFactory(); - - Schema writeSchema = table().schema(); - StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = - SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(partitionedSpec) - .build(); - - TaskWriter writer = - new SparkPartitionedWriter( - partitionedSpec, - fileFormat(), - appenders, - fileFactory, - io, - TARGET_FILE_SIZE_IN_BYTES, - writeSchema, - sparkWriteType); - - try (TaskWriter closableWriter = writer) { - for (InternalRow row : rows) { - 
closableWriter.write(row); - } - } - - blackhole.consume(writer.complete()); - } - - @Benchmark - @Threads(1) - public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOException { - FileIO io = table().io(); - - OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = - SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); - - FanoutDataWriter writer = - new FanoutDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); - - PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); - StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); - InternalRowWrapper internalRowWrapper = new InternalRowWrapper(dataSparkType); - - try (FanoutDataWriter closeableWriter = writer) { - for (InternalRow row : rows) { - partitionKey.partition(internalRowWrapper.wrap(row)); - closeableWriter.write(row, partitionedSpec, partitionKey); - } - } - - blackhole.consume(writer); - } - - @Benchmark - @Threads(1) - public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws IOException { - FileIO io = table().io(); - - OutputFileFactory fileFactory = newFileFactory(); - - Schema writeSchema = table().schema(); - StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = - SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(partitionedSpec) - .build(); - - TaskWriter writer = - new SparkPartitionedFanoutWriter( - partitionedSpec, - fileFormat(), - appenders, - fileFactory, - io, - TARGET_FILE_SIZE_IN_BYTES, - writeSchema, - sparkWriteType); - - try (TaskWriter closableWriter = writer) { - for (InternalRow row : rows) { - closableWriter.write(row); - } - } - - blackhole.consume(writer.complete()); - } - - @Benchmark - @Threads(1) - public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) - throws IOException { - FileIO io = table().io(); - - int equalityFieldId = table().schema().findField("longCol").fieldId(); - - OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = - SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[] {equalityFieldId}) - .build(); - - ClusteredEqualityDeleteWriter writer = - new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); - - PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); - StructType deleteSparkType = SparkSchemaUtil.convert(table().schema()); - InternalRowWrapper internalRowWrapper = new InternalRowWrapper(deleteSparkType); - - try (ClusteredEqualityDeleteWriter closeableWriter = writer) { - for (InternalRow row : rows) { - partitionKey.partition(internalRowWrapper.wrap(row)); - closeableWriter.write(row, partitionedSpec, partitionKey); - } - } - - blackhole.consume(writer); - } - - @Benchmark - @Threads(1) - public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) - throws IOException { - FileIO io = table().io(); - - OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = - SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - - ClusteredPositionDeleteWriter writer = - new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); - - PositionDelete positionDelete = 
PositionDelete.create(); - try (ClusteredPositionDeleteWriter closeableWriter = writer) { - for (InternalRow row : positionDeleteRows) { - String path = row.getString(0); - long pos = row.getLong(1); - positionDelete.set(path, pos, null); - closeableWriter.write(positionDelete, unpartitionedSpec, null); - } - } - - blackhole.consume(writer); - } - - private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java deleted file mode 100644 index 3cdde8d652e4..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.avro; - -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.spark.source.WritersBenchmark; - -/** - * A benchmark that evaluates the performance of various Iceberg writers for Avro data. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=AvroWritersBenchmark - * -PjmhOutputPath=benchmark/avro-writers-benchmark-result.txt - * - */ -public class AvroWritersBenchmark extends WritersBenchmark { - - @Override - protected FileFormat fileFormat() { - return FileFormat.AVRO; - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java deleted file mode 100644 index fa4c97ce6229..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.avro; - -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg - * and the built-in file source in Spark. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceFlatAvroDataReadBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-flat-avro-data-read-benchmark-result.txt - * - */ -public class IcebergSourceFlatAvroDataReadBenchmark extends IcebergSourceFlatDataBenchmark { - - private static final int NUM_FILES = 10; - private static final int NUM_ROWS = 1000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSource() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSource() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); - materialize(df); - }); - } - - private void appendData() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties( - tableProperties, - () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java deleted file mode 100644 index c08fa5c50cd4..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.avro; - -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceNestedDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of reading Avro data with a nested schema using Iceberg - * and the built-in file source in Spark. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceNestedAvroDataReadBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-nested-avro-data-read-benchmark-result.txt - * - */ -public class IcebergSourceNestedAvroDataReadBenchmark extends IcebergSourceNestedDataBenchmark { - - private static final int NUM_FILES = 10; - private static final int NUM_ROWS = 1000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSource() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).select("nested.col3"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSource() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = - spark().read().format("avro").load(dataLocation()).select("nested.col3"); - materialize(df); - }); - } - - private void appendData() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties( - tableProperties, - () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3"))); - appendAsFile(df); - } - }); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java deleted file mode 100644 index d0fdd8915780..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.orc; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceBenchmark; -import org.apache.iceberg.types.Types; - -/** - * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the - * Timestamp with zone type for ORC performance tests as Spark native reader does not support ORC's - * TIMESTAMP_INSTANT type - */ -public abstract class IcebergSourceFlatORCDataBenchmark extends IcebergSourceBenchmark { - - @Override - protected Configuration initHadoopConf() { - return new Configuration(); - } - - @Override - protected final Table initTable() { - Schema schema = - new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - // Disable timestamp column for ORC performance tests as Spark native reader does not - // support ORC's - // TIMESTAMP_INSTANT type - // optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); - HadoopTables tables = new HadoopTables(hadoopConf()); - Map properties = Maps.newHashMap(); - properties.put(TableProperties.METADATA_COMPRESSION, "gzip"); - return tables.create(schema, partitionSpec, properties, newTableLocation()); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java deleted file mode 100644 index 12accf7b76ed..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.orc; - -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg - * and the built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceFlatORCDataReadBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-flat-orc-data-read-benchmark-result.txt - * - */ -public class IcebergSourceFlatORCDataReadBenchmark extends IcebergSourceFlatORCDataBenchmark { - - private static final int NUM_FILES = 10; - private static final int NUM_ROWS = 1000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readIcebergNonVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readIcebergVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark() - .read() - .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg") - .load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIcebergNonVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIcebergVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark() - .read() - .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg") - .load(tableLocation) - .select("longCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - 
materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); - } - - private void appendData() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties( - tableProperties, - () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java deleted file mode 100644 index f57d6764c8ac..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.orc; - -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceNestedListDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Param; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the - * built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceNestedListORCDataWriteBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-nested-list-orc-data-write-benchmark-result.txt - * - */ -public class IcebergSourceNestedListORCDataWriteBenchmark - extends IcebergSourceNestedListDataBenchmark { - - @Setup - public void setupBenchmark() { - setupSpark(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Param({"2000", "20000"}) - private int numRows; - - @Benchmark - @Threads(1) - public void writeIceberg() { - String tableLocation = table().location(); - benchmarkData() - .write() - .format("iceberg") - .option("write-format", "orc") - .mode(SaveMode.Append) - .save(tableLocation); - } - - @Benchmark - @Threads(1) - public void writeIcebergDictionaryOff() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put("orc.dictionary.key.threshold", "0"); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - benchmarkData() - .write() - .format("iceberg") - .option("write-format", "orc") - .mode(SaveMode.Append) - .save(tableLocation); - }); - } - - @Benchmark - @Threads(1) - public void writeFileSource() { - benchmarkData().write().mode(SaveMode.Append).orc(dataLocation()); - } - - private Dataset benchmarkData() { - return spark() - .range(numRows) - .withColumn( - "outerlist", - array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) - .coalesce(1); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java deleted file mode 100644 index d0fe63484f7e..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source.orc; - -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.source.IcebergSourceNestedDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg - * and the built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceNestedORCDataReadBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-nested-orc-data-read-benchmark-result.txt - * - */ -public class IcebergSourceNestedORCDataReadBenchmark extends IcebergSourceNestedDataBenchmark { - - private static final int NUM_FILES = 10; - private static final int NUM_ROWS = 1000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readIcebergNonVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readIcebergVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark() - .read() - .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg") - .load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIcebergNonVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIcebergVectorized() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark() - .read() - .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg") - .load(tableLocation) - .selectExpr("nested.col3"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); - } - - private void appendData() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties( - tableProperties, - () -> { - for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumn( - "nested", - struct( - 
expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3"))); - appendAsFile(df); - } - }); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java deleted file mode 100644 index 2642c481f8e7..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet; - -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. - * - *

This class uses a dataset with a flat schema, where the records are clustered according to the - * column used in the filter predicate. - * - *

The performance is compared to the built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceFlatParquetDataFilterBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-filter-benchmark-result.txt - * - */ -public class IcebergSourceFlatParquetDataFilterBenchmark extends IcebergSourceFlatDataBenchmark { - - private static final String FILTER_COND = "dateCol == date_add(current_date(), 1)"; - private static final int NUM_FILES = 500; - private static final int NUM_ROWS = 10000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readWithFilterIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithFilterFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithFilterFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); - } - - private void appendData() { - for (int fileNum = 1; fileNum < NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java deleted file mode 100644 index 484572f9a541..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet; - -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of reading Parquet data with a flat schema using - * Iceberg and the built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceFlatParquetDataReadBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt - * - */ -public class IcebergSourceFlatParquetDataReadBenchmark extends IcebergSourceFlatDataBenchmark { - - private static final int NUM_FILES = 10; - private static final int NUM_ROWS = 1000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); - } - - private void appendData() { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", 
expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java deleted file mode 100644 index 8e42c49c5e04..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet; - -import static org.apache.spark.sql.functions.expr; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of writing Parquet data with a flat schema using - * Iceberg and the built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceFlatParquetDataWriteBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-write-benchmark-result.txt - * - */ -public class IcebergSourceFlatParquetDataWriteBenchmark extends IcebergSourceFlatDataBenchmark { - - private static final int NUM_ROWS = 5000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void writeIceberg() { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").mode(SaveMode.Append).save(tableLocation); - } - - @Benchmark - @Threads(1) - public void writeFileSource() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_COMPRESSION().key(), "gzip"); - withSQLConf(conf, () -> benchmarkData().write().mode(SaveMode.Append).parquet(dataLocation())); - } - - private Dataset benchmarkData() { - return spark() - .range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", expr("DATE_ADD(CURRENT_DATE(), (intCol % 20))")) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")) - .coalesce(1); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java deleted file mode 100644 index 099be4c01b54..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source.parquet; - -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceNestedListDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Param; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the - * built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceNestedListParquetDataWriteBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-nested-list-parquet-data-write-benchmark-result.txt - * - */ -public class IcebergSourceNestedListParquetDataWriteBenchmark - extends IcebergSourceNestedListDataBenchmark { - - @Setup - public void setupBenchmark() { - setupSpark(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Param({"2000", "20000"}) - private int numRows; - - @Benchmark - @Threads(1) - public void writeIceberg() { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").mode(SaveMode.Append).save(tableLocation); - } - - @Benchmark - @Threads(1) - public void writeFileSource() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_COMPRESSION().key(), "gzip"); - withSQLConf(conf, () -> benchmarkData().write().mode(SaveMode.Append).parquet(dataLocation())); - } - - private Dataset benchmarkData() { - return spark() - .range(numRows) - .withColumn( - "outerlist", - array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) - .coalesce(1); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java deleted file mode 100644 index eeb84b8efcd9..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet; - -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceNestedDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. - * - *

This class uses a dataset with nested data, where the records are clustered according to the - * column used in the filter predicate. - * - *

The performance is compared to the built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceNestedParquetDataFilterBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-filter-benchmark-result.txt - * - */ -public class IcebergSourceNestedParquetDataFilterBenchmark - extends IcebergSourceNestedDataBenchmark { - - private static final String FILTER_COND = "nested.col3 == 0"; - private static final int NUM_FILES = 500; - private static final int NUM_ROWS = 10000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readWithFilterIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithFilterFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithFilterFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); - } - - private void appendData() { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3"))); - appendAsFile(df); - } - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java deleted file mode 100644 index c369c75321fd..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet; - -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceNestedDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and the - * built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceNestedParquetDataReadBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-read-benchmark-result.txt - * - */ -public class IcebergSourceNestedParquetDataReadBenchmark extends IcebergSourceNestedDataBenchmark { - - private static final int NUM_FILES = 10; - private static final int NUM_ROWS = 1000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void readIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionIceberg() { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties( - tableProperties, - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSourceVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readWithProjectionFileSourceNonVectorized() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); - conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf( - conf, - () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); - } - - private void appendData() { - for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3"))); - appendAsFile(df); - } - } -} diff --git 
a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java deleted file mode 100644 index 450ecb709092..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet; - -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceNestedDataBenchmark; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the - * built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=IcebergSourceNestedParquetDataWriteBenchmark - * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-write-benchmark-result.txt - * - */ -public class IcebergSourceNestedParquetDataWriteBenchmark extends IcebergSourceNestedDataBenchmark { - - private static final int NUM_ROWS = 5000000; - - @Setup - public void setupBenchmark() { - setupSpark(); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Benchmark - @Threads(1) - public void writeIceberg() { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").mode(SaveMode.Append).save(tableLocation); - } - - @Benchmark - @Threads(1) - public void writeFileSource() { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_COMPRESSION().key(), "gzip"); - withSQLConf(conf, () -> benchmarkData().write().mode(SaveMode.Append).parquet(dataLocation())); - } - - private Dataset benchmarkData() { - return spark() - .range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - expr("id AS col3"))) - .coalesce(1); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java deleted file mode 100644 index d06409e129be..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet; - -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.spark.source.WritersBenchmark; - -/** - * A benchmark that evaluates the performance of various Iceberg writers for Parquet data. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=ParquetWritersBenchmark - * -PjmhOutputPath=benchmark/parquet-writers-benchmark-result.txt - * - */ -public class ParquetWritersBenchmark extends WritersBenchmark { - - @Override - protected FileFormat fileFormat() { - return FileFormat.PARQUET; - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java deleted file mode 100644 index 63f35578d14a..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet.vectorized; - -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.to_date; -import static org.apache.spark.sql.functions.to_timestamp; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.util.Map; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.catalyst.expressions.DateAdd; -import org.apache.spark.sql.types.DataTypes; -import org.openjdk.jmh.annotations.Setup; - -/** - * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema - * using vectorized Iceberg read path and the built-in file source in Spark. - * - *

To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=VectorizedReadDictionaryEncodedFlatParquetDataBenchmark - * -PjmhOutputPath=benchmark/results.txt - * - */ -public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark - extends VectorizedReadFlatParquetDataBenchmark { - - @Setup - @Override - public void setupBenchmark() { - setupSpark(true); - appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within - // bounds - System.setProperty("arrow.enable_unsafe_memory_access", "true"); - // Disable expensive null check for every get(index) call. - // Iceberg manages nullability checks itself instead of relying on arrow. - System.setProperty("arrow.enable_null_check_for_get", "false"); - } - - @Override - Map parquetWriteProps() { - Map properties = Maps.newHashMap(); - properties.put(TableProperties.METADATA_COMPRESSION, "gzip"); - return properties; - } - - @Override - void appendData() { - Dataset df = idDF(); - df = withLongColumnDictEncoded(df); - df = withIntColumnDictEncoded(df); - df = withFloatColumnDictEncoded(df); - df = withDoubleColumnDictEncoded(df); - df = withDecimalColumnDictEncoded(df); - df = withDateColumnDictEncoded(df); - df = withTimestampColumnDictEncoded(df); - df = withStringColumnDictEncoded(df); - df = df.drop("id"); - df.write().format("iceberg").mode(SaveMode.Append).save(table().location()); - } - - private static Column modColumn() { - return pmod(col("id"), lit(9)); - } - - private static Column date_add(Column start, Column days) { - return new Column(new DateAdd(start.expr(), days.expr())); - } - - private Dataset idDF() { - return spark().range(0, NUM_ROWS_PER_FILE * NUM_FILES, 1, NUM_FILES).toDF(); - } - - private static Dataset withLongColumnDictEncoded(Dataset df) { - return df.withColumn("longCol", modColumn().cast(DataTypes.LongType)); - } - - private static Dataset withIntColumnDictEncoded(Dataset df) { - return df.withColumn("intCol", modColumn().cast(DataTypes.IntegerType)); - } - - private static Dataset withFloatColumnDictEncoded(Dataset df) { - return df.withColumn("floatCol", modColumn().cast(DataTypes.FloatType)); - } - - private static Dataset withDoubleColumnDictEncoded(Dataset df) { - return df.withColumn("doubleCol", modColumn().cast(DataTypes.DoubleType)); - } - - private static Dataset withDecimalColumnDictEncoded(Dataset df) { - Types.DecimalType type = Types.DecimalType.of(20, 5); - return df.withColumn("decimalCol", lit(bigDecimal(type, 0)).plus(modColumn())); - } - - private static Dataset withDateColumnDictEncoded(Dataset df) { - Column days = modColumn().cast(DataTypes.ShortType); - return df.withColumn("dateCol", date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days)); - } - - private static Dataset withTimestampColumnDictEncoded(Dataset df) { - Column days = modColumn().cast(DataTypes.ShortType); - return df.withColumn( - "timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); - } - - private static Dataset withStringColumnDictEncoded(Dataset df) { - return df.withColumn("stringCol", modColumn().cast(DataTypes.StringType)); - } - - private static BigDecimal bigDecimal(Types.DecimalType type, int value) { - BigInteger unscaled = new BigInteger(String.valueOf(value + 1)); - return new BigDecimal(unscaled, type.scale()); - } -} diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java 
b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java deleted file mode 100644 index feb6c6d5d9eb..000000000000 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source.parquet.vectorized; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.when; - -import java.io.IOException; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.source.IcebergSourceBenchmark; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.internal.SQLConf; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Setup; -import org.openjdk.jmh.annotations.TearDown; -import org.openjdk.jmh.annotations.Threads; - -/** - * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized - * Iceberg read path and the built-in file source in Spark. - * - *
To run this benchmark for spark-2.4: - * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh - * -PjmhIncludeRegex=VectorizedReadFlatParquetDataBenchmark - * -PjmhOutputPath=benchmark/results.txt - * - */ -public class VectorizedReadFlatParquetDataBenchmark extends IcebergSourceBenchmark { - - static final int NUM_FILES = 5; - static final int NUM_ROWS_PER_FILE = 10_000_000; - - @Setup - public void setupBenchmark() { - setupSpark(); - appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within - // bounds - System.setProperty("arrow.enable_unsafe_memory_access", "true"); - // Disable expensive null check for every get(index) call. - // Iceberg manages nullability checks itself instead of relying on arrow. - System.setProperty("arrow.enable_null_check_for_get", "false"); - } - - @TearDown - public void tearDownBenchmark() throws IOException { - tearDownSpark(); - cleanupFiles(); - } - - @Override - protected Configuration initHadoopConf() { - return new Configuration(); - } - - @Override - protected Table initTable() { - Schema schema = - new Schema( - optional(1, "longCol", Types.LongType.get()), - optional(2, "intCol", Types.IntegerType.get()), - optional(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); - HadoopTables tables = new HadoopTables(hadoopConf()); - Map properties = parquetWriteProps(); - return tables.create(schema, partitionSpec, properties, newTableLocation()); - } - - Map parquetWriteProps() { - Map properties = Maps.newHashMap(); - properties.put(TableProperties.METADATA_COMPRESSION, "gzip"); - properties.put(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); - return properties; - } - - void appendData() { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = - spark() - .range(NUM_ROWS_PER_FILE) - .withColumn( - "longCol", - when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)).otherwise(col("id"))) - .drop("id") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(longCol AS STRING)")); - appendAsFile(df); - } - } - - @Benchmark - @Threads(1) - public void readIntegersIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("intCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readIntegersSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("intCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readLongsIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = 
spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readLongsSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFloatsIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("floatCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readFloatsSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readDoublesIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).select("doubleCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readDoublesSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readDecimalsIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).select("decimalCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readDecimalsSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readDatesIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("dateCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readDatesSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readTimestampsIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).select("timestampCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readTimestampsSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void readStringsIcebergVectorized5k() { - withTableProperties( - tablePropsWithVectorizationEnabled(5000), - () -> { - String tableLocation = table().location(); - Dataset df = - spark().read().format("iceberg").load(tableLocation).select("stringCol"); - materialize(df); - }); - } - - @Benchmark - @Threads(1) - public void 
readStringsSparkVectorized5k() { - withSQLConf( - sparkConfWithVectorizationEnabled(5000), - () -> { - Dataset df = spark().read().parquet(dataLocation()).select("stringCol"); - materialize(df); - }); - } - - private static Map tablePropsWithVectorizationEnabled(int batchSize) { - Map tableProperties = Maps.newHashMap(); - tableProperties.put(TableProperties.PARQUET_VECTORIZATION_ENABLED, "true"); - tableProperties.put(TableProperties.PARQUET_BATCH_SIZE, String.valueOf(batchSize)); - return tableProperties; - } - - private static Map sparkConfWithVectorizationEnabled(int batchSize) { - Map conf = Maps.newHashMap(); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); - conf.put(SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE().key(), String.valueOf(batchSize)); - return conf; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java deleted file mode 100644 index 8829e8132cb0..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import org.apache.iceberg.Table; -import org.apache.iceberg.common.DynConstructors; -import org.apache.spark.sql.SparkSession; - -/** - * An API for interacting with actions in Spark. - * - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for - * backward compatibility. This implementation is no longer maintained, the new implementation - * is available with Spark 3 - */ -@Deprecated -public class Actions { - - // Load the actual implementation of Actions via reflection to allow for differences - // between the major Spark APIs while still defining the API in this class. - private static final String IMPL_NAME = "SparkActions"; - private static DynConstructors.Ctor implConstructor; - - private static String implClass() { - return Actions.class.getPackage().getName() + "." 
+ IMPL_NAME; - } - - private static DynConstructors.Ctor actionConstructor() { - if (implConstructor == null) { - String className = implClass(); - try { - implConstructor = - DynConstructors.builder() - .hiddenImpl(className, SparkSession.class, Table.class) - .buildChecked(); - } catch (NoSuchMethodException e) { - throw new IllegalArgumentException( - "Cannot find appropriate Actions implementation on the classpath.", e); - } - } - return implConstructor; - } - - private SparkSession spark; - private Table table; - - protected Actions(SparkSession spark, Table table) { - this.spark = spark; - this.table = table; - } - - /** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for - * backward compatibility. This implementation is no longer maintained, the new implementation - * is available with Spark 3 - */ - @Deprecated - public static Actions forTable(SparkSession spark, Table table) { - return actionConstructor().newInstance(spark, table); - } - - /** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for - * backward compatibility. This implementation is no longer maintained, the new implementation - * is available with Spark 3 - */ - @Deprecated - public static Actions forTable(Table table) { - return forTable(SparkSession.active(), table); - } - - /** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for - * backward compatibility. This implementation is no longer maintained, the new implementation - * is available with Spark 3 - */ - @Deprecated - public RewriteDataFilesAction rewriteDataFiles() { - return new RewriteDataFilesAction(spark, table); - } - - protected SparkSession spark() { - return spark; - } - - protected Table table() { - return table; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java deleted file mode 100644 index 9e89786fc15d..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
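Note on the API removed above: callers typically reached RewriteDataFilesAction through the static factory shown in Actions.java. A minimal, hedged sketch follows; the table location is made up, and execute() is assumed from the base rewrite action rather than shown in this diff.

// Hedged usage sketch of the deprecated Spark 2.4 compaction entry point removed by this patch.
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.Actions;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.spark.sql.SparkSession;

public class RewriteSmallFilesExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("rewrite-data-files").getOrCreate();
    // Hypothetical table location; any HadoopTables- or catalog-loaded table works the same way.
    Table table = new HadoopTables(new Configuration()).load("hdfs://nn/warehouse/db/events");
    Actions.forTable(spark, table)
        .rewriteDataFiles()
        .execute();  // execute() comes from the base rewrite action, not shown in this diff
    spark.stop();
  }
}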
- */ -package org.apache.iceberg.actions; - -import java.util.List; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.spark.source.RowDataRewriter; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.sql.SparkSession; - -/** - * @deprecated since 0.12.0, keeping this in Spark 2.4 for backward compatibility. This - * implementation is no longer maintained, the new implementation is available with Spark 3 - */ -@Deprecated -public class RewriteDataFilesAction extends BaseRewriteDataFilesAction { - - private final JavaSparkContext sparkContext; - private FileIO fileIO; - - RewriteDataFilesAction(SparkSession spark, Table table) { - super(table); - this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } - - @Override - protected RewriteDataFilesAction self() { - return this; - } - - @Override - protected FileIO fileIO() { - if (this.fileIO == null) { - this.fileIO = SparkUtil.serializableFileIO(table()); - } - return this.fileIO; - } - - @Override - protected List rewriteDataForTasks(List combinedScanTasks) { - JavaRDD taskRDD = - sparkContext.parallelize(combinedScanTasks, combinedScanTasks.size()); - Broadcast tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table())); - RowDataRewriter rowDataRewriter = new RowDataRewriter(tableBroadcast, spec(), caseSensitive()); - return rowDataRewriter.rewriteDataForTasks(taskRDD); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java deleted file mode 100644 index a399c6e4837f..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import org.apache.iceberg.Table; -import org.apache.spark.sql.SparkSession; - -/** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for - * backward compatibility. 
This implementation is no longer maintained, the new implementation - * is available with Spark 3 - */ -@Deprecated -class SparkActions extends Actions { - protected SparkActions(SparkSession spark, Table table) { - super(spark, table); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java deleted file mode 100644 index 87de0a98b934..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import org.apache.iceberg.transforms.Transform; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Type; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DataTypes; - -public class IcebergSpark { - private IcebergSpark() {} - - public static void registerBucketUDF( - SparkSession session, String funcName, DataType sourceType, int numBuckets) { - SparkTypeToType typeConverter = new SparkTypeToType(); - Type sourceIcebergType = typeConverter.atomic(sourceType); - Transform bucket = Transforms.bucket(sourceIcebergType, numBuckets); - session - .udf() - .register( - funcName, - value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), - DataTypes.IntegerType); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java deleted file mode 100644 index c0756d924e2f..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
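The registration helper above wires an Iceberg bucket transform into Spark SQL as a UDF. A hedged usage sketch, with an illustrative function name and source table:

// Hedged sketch: register a UDF that mirrors Iceberg's bucket[16] transform on long values.
import org.apache.iceberg.spark.IcebergSpark;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;

public class BucketUdfExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("bucket-udf").getOrCreate();
    IcebergSpark.registerBucketUDF(spark, "iceberg_bucket16", DataTypes.LongType, 16);
    // "source_table" is an illustrative name for any table or view with a long "id" column.
    spark.sql("SELECT id, iceberg_bucket16(id) AS bucket FROM source_table").show();
    spark.stop();
  }
}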
- */ -package org.apache.iceberg.spark; - -/** Captures information about the current job which is used for displaying on the UI */ -public class JobGroupInfo { - private String groupId; - private String description; - private boolean interruptOnCancel; - - public JobGroupInfo(String groupId, String desc, boolean interruptOnCancel) { - this.groupId = groupId; - this.description = desc; - this.interruptOnCancel = interruptOnCancel; - } - - public String groupId() { - return groupId; - } - - public String description() { - return description; - } - - public boolean interruptOnCancel() { - return interruptOnCancel; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java deleted file mode 100644 index dc8ba69d40a8..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import org.apache.spark.SparkContext; -import org.apache.spark.SparkContext$; - -public class JobGroupUtils { - - private static final String JOB_GROUP_ID = SparkContext$.MODULE$.SPARK_JOB_GROUP_ID(); - private static final String JOB_GROUP_DESC = SparkContext$.MODULE$.SPARK_JOB_DESCRIPTION(); - private static final String JOB_INTERRUPT_ON_CANCEL = - SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); - - private JobGroupUtils() {} - - public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { - String groupId = sparkContext.getLocalProperty(JOB_GROUP_ID); - String description = sparkContext.getLocalProperty(JOB_GROUP_DESC); - String interruptOnCancel = sparkContext.getLocalProperty(JOB_INTERRUPT_ON_CANCEL); - return new JobGroupInfo(groupId, description, Boolean.parseBoolean(interruptOnCancel)); - } - - public static void setJobGroupInfo(SparkContext sparkContext, JobGroupInfo info) { - sparkContext.setLocalProperty(JOB_GROUP_ID, info.groupId()); - sparkContext.setLocalProperty(JOB_GROUP_DESC, info.description()); - sparkContext.setLocalProperty( - JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java deleted file mode 100644 index cdc0bf5f3cad..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
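These two helpers exist so an action can tag the Spark jobs it launches and then restore whatever job group was active before. A hedged sketch of that save/set/restore pattern, with an illustrative group id and description:

// Hedged sketch: run some Spark work under a descriptive job group, then restore the previous one.
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.JobGroupUtils;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SparkSession;

public class JobGroupExample {
  public static void runWithJobGroup(SparkSession spark, Runnable body) {
    SparkContext sc = spark.sparkContext();
    JobGroupInfo previous = JobGroupUtils.getJobGroupInfo(sc);
    JobGroupUtils.setJobGroupInfo(sc, new JobGroupInfo("EXPIRE-SNAPSHOTS", "Expiring old snapshots", false));
    try {
      body.run();  // Spark jobs triggered here show up under the group id in the UI
    } finally {
      JobGroupUtils.setJobGroupInfo(sc, previous);  // restore whatever group was set before
    }
  }
}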
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.function.Supplier; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Type.TypeID; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.BinaryType; -import org.apache.spark.sql.types.BooleanType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DateType; -import org.apache.spark.sql.types.DecimalType; -import org.apache.spark.sql.types.DoubleType; -import org.apache.spark.sql.types.FloatType; -import org.apache.spark.sql.types.IntegerType; -import org.apache.spark.sql.types.LongType; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.StringType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.types.TimestampType; - -public class PruneColumnsWithReordering extends TypeUtil.CustomOrderSchemaVisitor { - private final StructType requestedType; - private final Set filterRefs; - private DataType current = null; - - PruneColumnsWithReordering(StructType requestedType, Set filterRefs) { - this.requestedType = requestedType; - this.filterRefs = filterRefs; - } - - @Override - public Type schema(Schema schema, Supplier structResult) { - this.current = requestedType; - try { - return structResult.get(); - } finally { - this.current = null; - } - } - - @Override - public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull( - struct, "Cannot prune null struct. 
Pruning must start with a schema."); - Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); - StructType requestedStruct = (StructType) current; - - List fields = struct.fields(); - List types = Lists.newArrayList(fieldResults); - - boolean changed = false; - // use a LinkedHashMap to preserve the original order of filter fields that are not projected - Map projectedFields = Maps.newLinkedHashMap(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - Type type = types.get(i); - - if (type == null) { - changed = true; - - } else if (field.type() == type) { - projectedFields.put(field.name(), field); - - } else if (field.isOptional()) { - changed = true; - projectedFields.put( - field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type)); - - } else { - changed = true; - projectedFields.put( - field.name(), Types.NestedField.required(field.fieldId(), field.name(), type)); - } - } - - // Construct a new struct with the projected struct's order - boolean reordered = false; - StructField[] requestedFields = requestedStruct.fields(); - List newFields = Lists.newArrayListWithExpectedSize(requestedFields.length); - for (int i = 0; i < requestedFields.length; i += 1) { - // fields are resolved by name because Spark only sees the current table schema. - String name = requestedFields[i].name(); - if (!fields.get(i).name().equals(name)) { - reordered = true; - } - newFields.add(projectedFields.remove(name)); - } - - // Add remaining filter fields that were not explicitly projected - if (!projectedFields.isEmpty()) { - newFields.addAll(projectedFields.values()); - changed = true; // order probably changed - } - - if (reordered || changed) { - return Types.StructType.of(newFields); - } - - return struct; - } - - @Override - public Type field(Types.NestedField field, Supplier fieldResult) { - Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); - StructType requestedStruct = (StructType) current; - - // fields are resolved by name because Spark only sees the current table schema. - if (requestedStruct.getFieldIndex(field.name()).isEmpty()) { - // make sure that filter fields are projected even if they aren't in the requested schema. 
- if (filterRefs.contains(field.fieldId())) { - return field.type(); - } - return null; - } - - int fieldIndex = requestedStruct.fieldIndex(field.name()); - StructField requestedField = requestedStruct.fields()[fieldIndex]; - - Preconditions.checkArgument( - requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", - field.name()); - - this.current = requestedField.dataType(); - try { - return fieldResult.get(); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException( - "Invalid projection for field " + field.name() + ": " + e.getMessage(), e); - } finally { - this.current = requestedStruct; - } - } - - @Override - public Type list(Types.ListType list, Supplier elementResult) { - Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); - ArrayType requestedArray = (ArrayType) current; - - Preconditions.checkArgument( - requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", - requestedArray); - - this.current = requestedArray.elementType(); - try { - Type elementType = elementResult.get(); - if (list.elementType() == elementType) { - return list; - } - - // must be a projected element type, create a new list - if (list.isElementOptional()) { - return Types.ListType.ofOptional(list.elementId(), elementType); - } else { - return Types.ListType.ofRequired(list.elementId(), elementType); - } - } finally { - this.current = requestedArray; - } - } - - @Override - public Type map(Types.MapType map, Supplier keyResult, Supplier valueResult) { - Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); - MapType requestedMap = (MapType) current; - - Preconditions.checkArgument( - requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", - map); - Preconditions.checkArgument( - StringType.class.isInstance(requestedMap.keyType()), - "Invalid map key type (not string): %s", - requestedMap.keyType()); - - this.current = requestedMap.valueType(); - try { - Type valueType = valueResult.get(); - if (map.valueType() == valueType) { - return map; - } - - if (map.isValueOptional()) { - return Types.MapType.ofOptional(map.keyId(), map.valueId(), map.keyType(), valueType); - } else { - return Types.MapType.ofRequired(map.keyId(), map.valueId(), map.keyType(), valueType); - } - } finally { - this.current = requestedMap; - } - } - - @Override - public Type primitive(Type.PrimitiveType primitive) { - Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument( - expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", - primitive, - current); - - // additional checks based on type - switch (primitive.typeId()) { - case DECIMAL: - Types.DecimalType decimal = (Types.DecimalType) primitive; - DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument( - requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", - requestedDecimal.scale(), - decimal.scale()); - Preconditions.checkArgument( - requestedDecimal.precision() >= decimal.precision(), - "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), - decimal.precision()); - break; - case TIMESTAMP: - Types.TimestampType timestamp = (Types.TimestampType) primitive; - Preconditions.checkArgument( - 
timestamp.shouldAdjustToUTC(), - "Cannot project timestamp (without time zone) as timestamptz (with time zone)"); - break; - default: - } - - return primitive; - } - - private static final ImmutableMap> TYPES = - ImmutableMap.>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .buildOrThrow(); -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java deleted file mode 100644 index a6de035c466e..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
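The visitor above is driven through TypeUtil.visit, the same pattern used elsewhere in this module. A hedged sketch follows; it sits in the same package because the constructor is package-private, and the helper name is illustrative:

// Hedged sketch: prune an Iceberg schema to the columns Spark requested, keeping filter fields.
package org.apache.iceberg.spark;

import java.util.Set;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.TypeUtil;
import org.apache.spark.sql.types.StructType;

class PruneColumnsExample {
  static Schema prune(Schema tableSchema, StructType requestedSparkType, Set<Integer> filterFieldIds) {
    // The visit result is a struct; rebuilding a Schema from its fields yields the projection.
    return new Schema(
        TypeUtil.visit(tableSchema, new PruneColumnsWithReordering(requestedSparkType, filterFieldIds))
            .asStructType()
            .fields());
  }
}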
- */ -package org.apache.iceberg.spark; - -import java.util.List; -import java.util.Set; -import java.util.function.Supplier; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Type.TypeID; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.BinaryType; -import org.apache.spark.sql.types.BooleanType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DateType; -import org.apache.spark.sql.types.DecimalType; -import org.apache.spark.sql.types.DoubleType; -import org.apache.spark.sql.types.FloatType; -import org.apache.spark.sql.types.IntegerType; -import org.apache.spark.sql.types.LongType; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.StringType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.types.TimestampType; - -public class PruneColumnsWithoutReordering extends TypeUtil.CustomOrderSchemaVisitor { - private final StructType requestedType; - private final Set filterRefs; - private DataType current = null; - - PruneColumnsWithoutReordering(StructType requestedType, Set filterRefs) { - this.requestedType = requestedType; - this.filterRefs = filterRefs; - } - - @Override - public Type schema(Schema schema, Supplier structResult) { - this.current = requestedType; - try { - return structResult.get(); - } finally { - this.current = null; - } - } - - @Override - public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull( - struct, "Cannot prune null struct. Pruning must start with a schema."); - Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); - - List fields = struct.fields(); - List types = Lists.newArrayList(fieldResults); - - boolean changed = false; - List newFields = Lists.newArrayListWithExpectedSize(types.size()); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - Type type = types.get(i); - - if (type == null) { - changed = true; - - } else if (field.type() == type) { - newFields.add(field); - - } else if (field.isOptional()) { - changed = true; - newFields.add(Types.NestedField.optional(field.fieldId(), field.name(), type)); - - } else { - changed = true; - newFields.add(Types.NestedField.required(field.fieldId(), field.name(), type)); - } - } - - if (changed) { - return Types.StructType.of(newFields); - } - - return struct; - } - - @Override - public Type field(Types.NestedField field, Supplier fieldResult) { - Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); - StructType requestedStruct = (StructType) current; - - // fields are resolved by name because Spark only sees the current table schema. - if (requestedStruct.getFieldIndex(field.name()).isEmpty()) { - // make sure that filter fields are projected even if they aren't in the requested schema. 
- if (filterRefs.contains(field.fieldId())) { - return field.type(); - } - return null; - } - - int fieldIndex = requestedStruct.fieldIndex(field.name()); - StructField requestedField = requestedStruct.fields()[fieldIndex]; - - Preconditions.checkArgument( - requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", - field.name()); - - this.current = requestedField.dataType(); - try { - return fieldResult.get(); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException( - "Invalid projection for field " + field.name() + ": " + e.getMessage(), e); - } finally { - this.current = requestedStruct; - } - } - - @Override - public Type list(Types.ListType list, Supplier elementResult) { - Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); - ArrayType requestedArray = (ArrayType) current; - - Preconditions.checkArgument( - requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", - requestedArray); - - this.current = requestedArray.elementType(); - try { - Type elementType = elementResult.get(); - if (list.elementType() == elementType) { - return list; - } - - // must be a projected element type, create a new list - if (list.isElementOptional()) { - return Types.ListType.ofOptional(list.elementId(), elementType); - } else { - return Types.ListType.ofRequired(list.elementId(), elementType); - } - } finally { - this.current = requestedArray; - } - } - - @Override - public Type map(Types.MapType map, Supplier keyResult, Supplier valueResult) { - Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); - MapType requestedMap = (MapType) current; - - Preconditions.checkArgument( - requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", - map); - - this.current = requestedMap.valueType(); - try { - Type valueType = valueResult.get(); - if (map.valueType() == valueType) { - return map; - } - - if (map.isValueOptional()) { - return Types.MapType.ofOptional(map.keyId(), map.valueId(), map.keyType(), valueType); - } else { - return Types.MapType.ofRequired(map.keyId(), map.valueId(), map.keyType(), valueType); - } - } finally { - this.current = requestedMap; - } - } - - @Override - public Type primitive(Type.PrimitiveType primitive) { - Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument( - expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", - primitive, - current); - - // additional checks based on type - switch (primitive.typeId()) { - case DECIMAL: - Types.DecimalType decimal = (Types.DecimalType) primitive; - DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument( - requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", - requestedDecimal.scale(), - decimal.scale()); - Preconditions.checkArgument( - requestedDecimal.precision() >= decimal.precision(), - "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), - decimal.precision()); - break; - default: - } - - return primitive; - } - - private static final ImmutableMap> TYPES = - ImmutableMap.>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, 
DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .buildOrThrow(); -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java deleted file mode 100644 index 33e5ca936800..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import java.util.Locale; -import java.util.Map; -import java.util.function.Function; -import org.apache.iceberg.Table; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.spark.sql.RuntimeConfig; -import org.apache.spark.sql.SparkSession; - -class SparkConfParser { - - private final Map properties; - private final RuntimeConfig sessionConf; - private final Map options; - - SparkConfParser(SparkSession spark, Table table, Map options) { - this.properties = table.properties(); - this.sessionConf = spark.conf(); - this.options = options; - } - - public BooleanConfParser booleanConf() { - return new BooleanConfParser(); - } - - public IntConfParser intConf() { - return new IntConfParser(); - } - - public LongConfParser longConf() { - return new LongConfParser(); - } - - public StringConfParser stringConf() { - return new StringConfParser(); - } - - class BooleanConfParser extends ConfParser { - private Boolean defaultValue; - - @Override - protected BooleanConfParser self() { - return this; - } - - public BooleanConfParser defaultValue(boolean value) { - this.defaultValue = value; - return self(); - } - - public boolean parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Boolean::parseBoolean, defaultValue); - } - } - - class IntConfParser extends ConfParser { - private Integer defaultValue; - - @Override - protected IntConfParser self() { - return this; - } - - public IntConfParser defaultValue(int value) { - this.defaultValue = value; - return self(); - } - - public int parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Integer::parseInt, defaultValue); - } - } - - class LongConfParser extends ConfParser { - private Long defaultValue; - - @Override - protected LongConfParser self() { - return this; - } - - public LongConfParser defaultValue(long value) { - this.defaultValue = value; - return self(); - } - - public long parse() { - 
Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Long::parseLong, defaultValue); - } - - public Long parseOptional() { - return parse(Long::parseLong, null); - } - } - - class StringConfParser extends ConfParser { - private String defaultValue; - - @Override - protected StringConfParser self() { - return this; - } - - public StringConfParser defaultValue(String value) { - this.defaultValue = value; - return self(); - } - - public String parse() { - Preconditions.checkArgument(defaultValue != null, "Default value cannot be null"); - return parse(Function.identity(), defaultValue); - } - } - - abstract class ConfParser { - private String optionName; - private String sessionConfName; - private String tablePropertyName; - - protected abstract ThisT self(); - - public ThisT option(String name) { - this.optionName = name; - return self(); - } - - public ThisT sessionConf(String name) { - this.sessionConfName = name; - return self(); - } - - public ThisT tableProperty(String name) { - this.tablePropertyName = name; - return self(); - } - - protected T parse(Function conversion, T defaultValue) { - if (optionName != null) { - // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case - // map - String optionValue = options.get(optionName.toLowerCase(Locale.ROOT)); - if (optionValue != null) { - return conversion.apply(optionValue); - } - } - - if (sessionConfName != null) { - String sessionConfValue = sessionConf.get(sessionConfName, null); - if (sessionConfValue != null) { - return conversion.apply(sessionConfValue); - } - } - - if (tablePropertyName != null) { - String propertyValue = properties.get(tablePropertyName); - if (propertyValue != null) { - return conversion.apply(propertyValue); - } - } - - return defaultValue; - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java deleted file mode 100644 index 2cb7f0eba646..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
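The parser above resolves a setting with a fixed precedence: read option first, then Spark session conf, then table property, then the supplied default. A hedged sketch for a long-valued setting; the "split-size" option name is illustrative, and the example sits in the same package because the parser is package-private:

// Hedged sketch: resolve a split size from option, table property, or default.
package org.apache.iceberg.spark;

import java.util.Map;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.spark.sql.SparkSession;

class SplitSizeConfExample {
  static long splitSize(SparkSession spark, Table table, Map<String, String> readOptions) {
    return new SparkConfParser(spark, table, readOptions)
        .longConf()
        .option("split-size")
        .tableProperty(TableProperties.SPLIT_SIZE)
        .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT)
        .parse();
  }
}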
- */ -package org.apache.iceberg.spark; - -import java.nio.ByteBuffer; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.types.StructType; - -public class SparkDataFile implements DataFile { - - private final int filePathPosition; - private final int fileFormatPosition; - private final int partitionPosition; - private final int recordCountPosition; - private final int fileSizeInBytesPosition; - private final int columnSizesPosition; - private final int valueCountsPosition; - private final int nullValueCountsPosition; - private final int nanValueCountsPosition; - private final int lowerBoundsPosition; - private final int upperBoundsPosition; - private final int keyMetadataPosition; - private final int splitOffsetsPosition; - private final int sortOrderIdPosition; - private final Type lowerBoundsType; - private final Type upperBoundsType; - private final Type keyMetadataType; - - private final SparkStructLike wrappedPartition; - private Row wrapped; - - public SparkDataFile(Types.StructType type, StructType sparkType) { - this.lowerBoundsType = type.fieldType("lower_bounds"); - this.upperBoundsType = type.fieldType("upper_bounds"); - this.keyMetadataType = type.fieldType("key_metadata"); - this.wrappedPartition = new SparkStructLike(type.fieldType("partition").asStructType()); - - Map positions = Maps.newHashMap(); - type.fields() - .forEach( - field -> { - String fieldName = field.name(); - positions.put(fieldName, fieldPosition(fieldName, sparkType)); - }); - - filePathPosition = positions.get("file_path"); - fileFormatPosition = positions.get("file_format"); - partitionPosition = positions.get("partition"); - recordCountPosition = positions.get("record_count"); - fileSizeInBytesPosition = positions.get("file_size_in_bytes"); - columnSizesPosition = positions.get("column_sizes"); - valueCountsPosition = positions.get("value_counts"); - nullValueCountsPosition = positions.get("null_value_counts"); - nanValueCountsPosition = positions.get("nan_value_counts"); - lowerBoundsPosition = positions.get("lower_bounds"); - upperBoundsPosition = positions.get("upper_bounds"); - keyMetadataPosition = positions.get("key_metadata"); - splitOffsetsPosition = positions.get("split_offsets"); - sortOrderIdPosition = positions.get("sort_order_id"); - } - - public SparkDataFile wrap(Row row) { - this.wrapped = row; - if (wrappedPartition.size() > 0) { - this.wrappedPartition.wrap(row.getAs(partitionPosition)); - } - return this; - } - - @Override - public Long pos() { - return null; - } - - @Override - public int specId() { - return -1; - } - - @Override - public CharSequence path() { - return wrapped.getAs(filePathPosition); - } - - @Override - public FileFormat format() { - return FileFormat.fromString(wrapped.getString(fileFormatPosition)); - } - - @Override - public StructLike partition() { - return wrappedPartition; - } - - @Override - public long recordCount() { - return wrapped.getAs(recordCountPosition); - } - - @Override - public long fileSizeInBytes() { - return wrapped.getAs(fileSizeInBytesPosition); - } - - @Override - public Map columnSizes() { - return wrapped.isNullAt(columnSizesPosition) ? 
null : wrapped.getJavaMap(columnSizesPosition); - } - - @Override - public Map valueCounts() { - return wrapped.isNullAt(valueCountsPosition) ? null : wrapped.getJavaMap(valueCountsPosition); - } - - @Override - public Map nullValueCounts() { - return wrapped.isNullAt(nullValueCountsPosition) - ? null - : wrapped.getJavaMap(nullValueCountsPosition); - } - - @Override - public Map nanValueCounts() { - return wrapped.isNullAt(nanValueCountsPosition) - ? null - : wrapped.getJavaMap(nanValueCountsPosition); - } - - @Override - public Map lowerBounds() { - Map lowerBounds = - wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); - return convert(lowerBoundsType, lowerBounds); - } - - @Override - public Map upperBounds() { - Map upperBounds = - wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); - return convert(upperBoundsType, upperBounds); - } - - @Override - public ByteBuffer keyMetadata() { - return convert(keyMetadataType, wrapped.get(keyMetadataPosition)); - } - - @Override - public DataFile copy() { - throw new UnsupportedOperationException("Not implemented: copy"); - } - - @Override - public DataFile copyWithoutStats() { - throw new UnsupportedOperationException("Not implemented: copyWithoutStats"); - } - - @Override - public List splitOffsets() { - return wrapped.isNullAt(splitOffsetsPosition) ? null : wrapped.getList(splitOffsetsPosition); - } - - @Override - public Integer sortOrderId() { - return wrapped.getAs(sortOrderIdPosition); - } - - private int fieldPosition(String name, StructType sparkType) { - try { - return sparkType.fieldIndex(name); - } catch (IllegalArgumentException e) { - // the partition field is absent for unpartitioned tables - if (name.equals("partition") && wrappedPartition.size() == 0) { - return -1; - } - throw e; - } - } - - @SuppressWarnings("unchecked") - private T convert(Type valueType, Object value) { - return (T) SparkValueConverter.convert(valueType, value); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java deleted file mode 100644 index 5c6fe3e0ff96..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
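SparkDataFile is a reusable wrapper that exposes a Spark Row with the files-metadata layout as an Iceberg DataFile. A hedged sketch; how filesDf is produced is left out, and DataFile.getType(...) is assumed to describe its schema:

// Hedged sketch: wrap metadata rows as DataFile handles, reusing one container as wrap() intends.
import java.util.List;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.SparkDataFile;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class SparkDataFileExample {
  public static void printFiles(Table table, Dataset<Row> filesDf) {
    Types.StructType fileType = DataFile.getType(table.spec().partitionType());
    SparkDataFile wrapper = new SparkDataFile(fileType, filesDf.schema());
    List<Row> rows = filesDf.collectAsList();
    for (Row row : rows) {
      DataFile file = wrapper.wrap(row);
      System.out.println(file.path() + " (" + file.recordCount() + " records)");
    }
  }
}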
- */ -package org.apache.iceberg.spark; - -import com.google.errorprone.annotations.FormatMethod; -import java.io.IOException; -import org.apache.iceberg.exceptions.NoSuchNamespaceException; -import org.apache.iceberg.exceptions.NoSuchTableException; -import org.apache.iceberg.exceptions.RuntimeIOException; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.spark.sql.AnalysisException; - -public class SparkExceptionUtil { - - private SparkExceptionUtil() {} - - /** - * Converts checked exceptions to unchecked exceptions. - * - * @param cause a checked exception object which is to be converted to its unchecked equivalent. - * @param message exception message as a format string - * @param args format specifiers - * @return unchecked exception. - */ - @FormatMethod - public static RuntimeException toUncheckedException( - final Throwable cause, final String message, final Object... args) { - // Parameters are required to be final to help @FormatMethod do static analysis - if (cause instanceof RuntimeException) { - return (RuntimeException) cause; - - } else if (cause instanceof org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException) { - return new NoSuchNamespaceException(cause, message, args); - - } else if (cause instanceof org.apache.spark.sql.catalyst.analysis.NoSuchTableException) { - return new NoSuchTableException(cause, message, args); - - } else if (cause instanceof AnalysisException) { - return new ValidationException(cause, message, args); - - } else if (cause instanceof IOException) { - return new RuntimeIOException((IOException) cause, message, args); - - } else { - return new RuntimeException(String.format(message, args), cause); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java deleted file mode 100644 index 48be75e6ea26..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
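The helper above rethrows checked exceptions as their Iceberg unchecked equivalents. A hedged sketch using a Hadoop FileSystem call purely as an example of code that throws a checked IOException:

// Hedged sketch: an IOException surfaces as Iceberg's RuntimeIOException, so no throws clause is needed.
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.spark.SparkExceptionUtil;

public class ExceptionConversionExample {
  public static void deleteFile(FileSystem fs, Path path) {
    try {
      fs.delete(path, false /* non-recursive */);
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Failed to delete %s", path);
    }
  }
}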
- */ -package org.apache.iceberg.spark; - -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; - -import java.sql.Date; -import java.sql.Timestamp; -import java.util.Objects; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expression.Operation; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.util.NaNUtil; -import org.apache.spark.sql.catalyst.util.DateTimeUtils; -import org.apache.spark.sql.sources.And; -import org.apache.spark.sql.sources.EqualNullSafe; -import org.apache.spark.sql.sources.EqualTo; -import org.apache.spark.sql.sources.Filter; -import org.apache.spark.sql.sources.GreaterThan; -import org.apache.spark.sql.sources.GreaterThanOrEqual; -import org.apache.spark.sql.sources.In; -import org.apache.spark.sql.sources.IsNotNull; -import org.apache.spark.sql.sources.IsNull; -import org.apache.spark.sql.sources.LessThan; -import org.apache.spark.sql.sources.LessThanOrEqual; -import org.apache.spark.sql.sources.Not; -import org.apache.spark.sql.sources.Or; -import org.apache.spark.sql.sources.StringStartsWith; - -public class SparkFilters { - private SparkFilters() {} - - private static final ImmutableMap, Operation> FILTERS = - ImmutableMap., Operation>builder() - .put(EqualTo.class, Operation.EQ) - .put(EqualNullSafe.class, Operation.EQ) - .put(GreaterThan.class, Operation.GT) - .put(GreaterThanOrEqual.class, Operation.GT_EQ) - .put(LessThan.class, Operation.LT) - .put(LessThanOrEqual.class, Operation.LT_EQ) - .put(In.class, Operation.IN) - .put(IsNull.class, Operation.IS_NULL) - .put(IsNotNull.class, Operation.NOT_NULL) - .put(And.class, Operation.AND) - .put(Or.class, Operation.OR) - .put(Not.class, Operation.NOT) - .put(StringStartsWith.class, Operation.STARTS_WITH) - .buildOrThrow(); - - public static Expression convert(Filter filter) { - // avoid using a chain of if instanceof statements by mapping to the expression enum. 
- Operation op = FILTERS.get(filter.getClass()); - if (op != null) { - switch (op) { - case IS_NULL: - IsNull isNullFilter = (IsNull) filter; - return isNull(isNullFilter.attribute()); - - case NOT_NULL: - IsNotNull notNullFilter = (IsNotNull) filter; - return notNull(notNullFilter.attribute()); - - case LT: - LessThan lt = (LessThan) filter; - return lessThan(lt.attribute(), convertLiteral(lt.value())); - - case LT_EQ: - LessThanOrEqual ltEq = (LessThanOrEqual) filter; - return lessThanOrEqual(ltEq.attribute(), convertLiteral(ltEq.value())); - - case GT: - GreaterThan gt = (GreaterThan) filter; - return greaterThan(gt.attribute(), convertLiteral(gt.value())); - - case GT_EQ: - GreaterThanOrEqual gtEq = (GreaterThanOrEqual) filter; - return greaterThanOrEqual(gtEq.attribute(), convertLiteral(gtEq.value())); - - case EQ: // used for both eq and null-safe-eq - if (filter instanceof EqualTo) { - EqualTo eq = (EqualTo) filter; - // comparison with null in normal equality is always null. this is probably a mistake. - Preconditions.checkNotNull( - eq.value(), "Expression is always false (eq is not null-safe): %s", filter); - return handleEqual(eq.attribute(), eq.value()); - } else { - EqualNullSafe eq = (EqualNullSafe) filter; - if (eq.value() == null) { - return isNull(eq.attribute()); - } else { - return handleEqual(eq.attribute(), eq.value()); - } - } - - case IN: - In inFilter = (In) filter; - return in( - inFilter.attribute(), - Stream.of(inFilter.values()) - .filter(Objects::nonNull) - .map(SparkFilters::convertLiteral) - .collect(Collectors.toList())); - - case NOT: - Not notFilter = (Not) filter; - Expression child = convert(notFilter.child()); - if (child != null) { - return not(child); - } - return null; - - case AND: - { - And andFilter = (And) filter; - Expression left = convert(andFilter.left()); - Expression right = convert(andFilter.right()); - if (left != null && right != null) { - return and(left, right); - } - return null; - } - - case OR: - { - Or orFilter = (Or) filter; - Expression left = convert(orFilter.left()); - Expression right = convert(orFilter.right()); - if (left != null && right != null) { - return or(left, right); - } - return null; - } - - case STARTS_WITH: - { - StringStartsWith stringStartsWith = (StringStartsWith) filter; - return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); - } - } - } - - return null; - } - - private static Object convertLiteral(Object value) { - if (value instanceof Timestamp) { - return DateTimeUtils.fromJavaTimestamp((Timestamp) value); - } else if (value instanceof Date) { - return DateTimeUtils.fromJavaDate((Date) value); - } - return value; - } - - private static Expression handleEqual(String attribute, Object value) { - if (NaNUtil.isNaN(value)) { - return isNaN(attribute); - } else { - return equal(attribute, convertLiteral(value)); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java deleted file mode 100644 index b35213501aef..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
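The SparkFilters.convert method above maps Spark data source filters onto Iceberg expressions. A minimal usage sketch (the column names and values here are hypothetical, not taken from this patch):

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.spark.SparkFilters;
import org.apache.spark.sql.sources.And;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.GreaterThan;
import org.apache.spark.sql.sources.IsNotNull;

class SparkFiltersSketch {
  static Expression example() {
    // a Spark pushed-down filter tree: ts IS NOT NULL AND id > 100
    Filter sparkFilter = new And(new IsNotNull("ts"), new GreaterThan("id", 100));
    // converted into the equivalent Iceberg expression (an AND of notNull and greaterThan)
    return SparkFilters.convert(sparkFilter);
  }
}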
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.FixupTypes; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; - -/** - * By default, the Spark timestamp type is converted to the Iceberg {@link - * Types.TimestampType#withZone()} type. Setting {@link - * SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} to 'true' converts it to - * {@link Types.TimestampType#withoutZone()} instead; this visitor performs that fixup. - */ -class SparkFixupTimestampType extends FixupTypes { - - private SparkFixupTimestampType(Schema referenceSchema) { - super(referenceSchema); - } - - static Schema fixup(Schema schema) { - return new Schema( - TypeUtil.visit(schema, new SparkFixupTimestampType(schema)).asStructType().fields()); - } - - @Override - public Type primitive(Type.PrimitiveType primitive) { - if (primitive.typeId() == Type.TypeID.TIMESTAMP) { - return Types.TimestampType.withoutZone(); - } - return primitive; - } - - @Override - protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { - return Type.TypeID.TIMESTAMP.equals(type.typeId()); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java deleted file mode 100644 index 6c4ec39b20f1..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.FixupTypes; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; - -/** - * Some types, like binary and fixed, are converted to the same Spark type. Conversion back can - * produce only one, which may not be correct.
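A minimal sketch of the ambiguity and the fixup, assuming a table schema with a single uuid column (SparkSchemaUtil is the converter removed further down in this patch):

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.StructType;

class FixupTypesSketch {
  static Schema roundTrip() {
    Schema original = new Schema(Types.NestedField.required(1, "id", Types.UUIDType.get()));
    // uuid has no Spark equivalent, so the Spark schema ends up with a string column
    StructType sparkType = SparkSchemaUtil.convert(original);
    // converting back with the original schema as a reference lets the fixup restore uuid
    return SparkSchemaUtil.convert(original, sparkType);
  }
}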
- */ -class SparkFixupTypes extends FixupTypes { - - private SparkFixupTypes(Schema referenceSchema) { - super(referenceSchema); - } - - static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema( - TypeUtil.visit(schema, new SparkFixupTypes(referenceSchema)).asStructType().fields()); - } - - @Override - protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { - switch (type.typeId()) { - case STRING: - if (source.typeId() == Type.TypeID.UUID) { - return true; - } - break; - case BINARY: - if (source.typeId() == Type.TypeID.FIXED) { - return true; - } - break; - case TIMESTAMP: - if (source.typeId() == Type.TypeID.TIMESTAMP) { - return true; - } - break; - default: - } - return false; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java deleted file mode 100644 index f6fdb0df6d83..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import java.util.Map; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.spark.sql.SparkSession; - -/** - * A class for common Iceberg configs for Spark reads. - * - *

If a config is set at multiple levels, the following order of precedence is used (top to - * bottom): - * - *

    - *
  1. Read options - *
  2. Session configuration - *
  3. Table metadata - *
- * - * The most specific value is set in read options and takes precedence over all other configs. If no - * read option is provided, this class checks the session configuration for any overrides. If no - * applicable value is found in the session configuration, this class uses the table metadata. - * - *
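As a quick illustration of that precedence, a read option set on the DataFrameReader wins over both the session configuration and the table property; the table location below is hypothetical:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class ReadConfSketch {
  static Dataset<Row> read(SparkSession spark) {
    return spark
        .read()
        .format("iceberg")
        // read option: highest precedence, overrides read.split.target-size on the table
        .option("split-size", "134217728")
        .load("hdfs://nn:8020/warehouse/db/sample_table");
  }
}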

Note this class is NOT meant to be serialized and sent to executors. - */ -public class SparkReadConf { - - private final Table table; - private final Map readOptions; - private final SparkConfParser confParser; - - public SparkReadConf(SparkSession spark, Table table, Map readOptions) { - this.table = table; - this.readOptions = readOptions; - this.confParser = new SparkConfParser(spark, table, readOptions); - } - - public boolean localityEnabled() { - boolean defaultValue = Util.mayHaveBlockLocations(table.io(), table.location()); - return PropertyUtil.propertyAsBoolean(readOptions, SparkReadOptions.LOCALITY, defaultValue); - } - - public Long snapshotId() { - return confParser.longConf().option(SparkReadOptions.SNAPSHOT_ID).parseOptional(); - } - - public Long asOfTimestamp() { - return confParser.longConf().option(SparkReadOptions.AS_OF_TIMESTAMP).parseOptional(); - } - - public Long startSnapshotId() { - return confParser.longConf().option(SparkReadOptions.START_SNAPSHOT_ID).parseOptional(); - } - - public Long endSnapshotId() { - return confParser.longConf().option(SparkReadOptions.END_SNAPSHOT_ID).parseOptional(); - } - - public boolean parquetVectorizationEnabled() { - return confParser - .booleanConf() - .option(SparkReadOptions.VECTORIZATION_ENABLED) - .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) - .tableProperty(TableProperties.PARQUET_VECTORIZATION_ENABLED) - .defaultValue(TableProperties.PARQUET_VECTORIZATION_ENABLED_DEFAULT) - .parse(); - } - - public int parquetBatchSize() { - return confParser - .intConf() - .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) - .tableProperty(TableProperties.PARQUET_BATCH_SIZE) - .defaultValue(TableProperties.PARQUET_BATCH_SIZE_DEFAULT) - .parse(); - } - - public boolean orcVectorizationEnabled() { - return confParser - .booleanConf() - .option(SparkReadOptions.VECTORIZATION_ENABLED) - .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) - .tableProperty(TableProperties.ORC_VECTORIZATION_ENABLED) - .defaultValue(TableProperties.ORC_VECTORIZATION_ENABLED_DEFAULT) - .parse(); - } - - public int orcBatchSize() { - return confParser - .intConf() - .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) - .tableProperty(TableProperties.ORC_BATCH_SIZE) - .defaultValue(TableProperties.ORC_BATCH_SIZE_DEFAULT) - .parse(); - } - - public long splitSize() { - return confParser - .longConf() - .option(SparkReadOptions.SPLIT_SIZE) - .tableProperty(TableProperties.SPLIT_SIZE) - .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) - .parse(); - } - - public int splitLookback() { - return confParser - .intConf() - .option(SparkReadOptions.LOOKBACK) - .tableProperty(TableProperties.SPLIT_LOOKBACK) - .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) - .parse(); - } - - public long splitOpenFileCost() { - return confParser - .longConf() - .option(SparkReadOptions.FILE_OPEN_COST) - .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) - .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) - .parse(); - } - - /** - * Enables reading a timestamp without time zone as a timestamp with time zone. - * - *

Generally, this is not safe as a timestamp without time zone is supposed to represent the - * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, - * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so - * that the corresponding time in the reader timezone is displayed. - * - *

When set to false (default), an exception must be thrown while reading a timestamp without - * time zone. - * - * @return boolean indicating if reading timestamps without timezone is allowed - */ - public boolean handleTimestampWithoutZone() { - return confParser - .booleanConf() - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) - .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) - .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) - .parse(); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java deleted file mode 100644 index 0b5c5902fa99..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -/** Spark DF read options */ -public class SparkReadOptions { - - private SparkReadOptions() {} - - // Snapshot ID of the table snapshot to read - public static final String SNAPSHOT_ID = "snapshot-id"; - - // Start snapshot ID used in incremental scans (exclusive) - public static final String START_SNAPSHOT_ID = "start-snapshot-id"; - - // End snapshot ID used in incremental scans (inclusive) - public static final String END_SNAPSHOT_ID = "end-snapshot-id"; - - // A timestamp in milliseconds; the snapshot used will be the snapshot current at this time. 
- public static final String AS_OF_TIMESTAMP = "as-of-timestamp"; - - // Overrides the table's read.split.target-size and read.split.metadata-target-size - public static final String SPLIT_SIZE = "split-size"; - - // Overrides the table's read.split.planning-lookback - public static final String LOOKBACK = "lookback"; - - // Overrides the table's read.split.open-file-cost - public static final String FILE_OPEN_COST = "file-open-cost"; - - // Overrides the table's read.parquet.vectorization.enabled and read.orc.vectorization.enabled - public static final String VECTORIZATION_ENABLED = "vectorization-enabled"; - - // Overrides the table's read.parquet.vectorization.batch-size - public static final String VECTORIZATION_BATCH_SIZE = "batch-size"; - - // Set ID that is used to fetch file scan tasks - public static final String FILE_SCAN_TASK_SET_ID = "file-scan-task-set-id"; - - // Skip delete snapshots while reading a stream out of an Iceberg table - public static final String STREAMING_SKIP_DELETE_SNAPSHOTS = "streaming-skip-delete-snapshots"; - - // Controls whether to allow reading timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = - "handle-timestamp-without-timezone"; - - // Controls whether to report locality information to Spark while allocating input partitions - public static final String LOCALITY = "locality"; -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java deleted file mode 100644 index fa8bd719f391..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License.
- */ -package org.apache.iceberg.spark; - -public class SparkSQLProperties { - - private SparkSQLProperties() {} - - // Controls whether vectorized reads are enabled - public static final String VECTORIZATION_ENABLED = "spark.sql.iceberg.vectorization.enabled"; - - // Controls whether reading/writing timestamps without timezones is allowed - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = - "spark.sql.iceberg.handle-timestamp-without-timezone"; - public static final boolean HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT = false; - - // Controls whether timestamp types for new tables should be stored with timezone info - public static final String USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES = - "spark.sql.iceberg.use-timestamp-without-timezone-in-new-tables"; - public static final boolean USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES_DEFAULT = false; - - // Controls whether to perform the nullability check during writes - public static final String CHECK_NULLABILITY = "spark.sql.iceberg.check-nullability"; - public static final boolean CHECK_NULLABILITY_DEFAULT = true; - - // Controls whether to check the order of fields during writes - public static final String CHECK_ORDERING = "spark.sql.iceberg.check-ordering"; - public static final boolean CHECK_ORDERING_DEFAULT = true; -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java deleted file mode 100644 index 653987e654aa..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Binder; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.relocated.com.google.common.base.Splitter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.math.LongMath; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.AnalysisException; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalog.Column; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.StructType; - -/** Helper methods for working with Spark/Hive metadata. 
*/ -public class SparkSchemaUtil { - private SparkSchemaUtil() {} - - /** - * Returns a {@link Schema} for the given table with fresh field ids. - * - *

This creates a Schema for an existing table by looking up the table's schema with Spark and - * converting that schema. Spark/Hive partition columns are included in the schema. - * - * @param spark a Spark session - * @param name a table name and (optional) database - * @return a Schema for the table, if found - */ - public static Schema schemaForTable(SparkSession spark, String name) { - StructType sparkType = spark.table(name).schema(); - Type converted = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)); - return new Schema(converted.asNestedType().asStructType().fields()); - } - - /** - * Returns a {@link PartitionSpec} for the given table. - * - *
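A short sketch combining schemaForTable above with the specForTable helper described next (the table name is hypothetical):

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.SparkSession;

class SchemaLookupSketch {
  static void inspect(SparkSession spark) throws AnalysisException {
    // Iceberg schema with fresh field ids, derived from the Spark/Hive table metadata
    Schema schema = SparkSchemaUtil.schemaForTable(spark, "db.events");
    // identity partition spec built from the table's partition columns
    PartitionSpec spec = SparkSchemaUtil.specForTable(spark, "db.events");
    System.out.println(schema + "\n" + spec);
  }
}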

This creates a partition spec for an existing table by looking up the table's schema and - * creating a spec with identity partitions for each partition column. - * - * @param spark a Spark session - * @param name a table name and (optional) database - * @return a PartitionSpec for the table - * @throws AnalysisException if thrown by the Spark catalog - */ - public static PartitionSpec specForTable(SparkSession spark, String name) - throws AnalysisException { - List parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name)); - String db = parts.size() == 1 ? "default" : parts.get(0); - String table = parts.get(parts.size() == 1 ? 0 : 1); - - PartitionSpec spec = - identitySpec( - schemaForTable(spark, name), spark.catalog().listColumns(db, table).collectAsList()); - return spec == null ? PartitionSpec.unpartitioned() : spec; - } - - /** - * Convert a {@link Schema} to a {@link DataType Spark type}. - * - * @param schema a Schema - * @return the equivalent Spark type - * @throws IllegalArgumentException if the type cannot be converted to Spark - */ - public static StructType convert(Schema schema) { - return (StructType) TypeUtil.visit(schema, new TypeToSparkType()); - } - - /** - * Convert a {@link Type} to a {@link DataType Spark type}. - * - * @param type a Type - * @return the equivalent Spark type - * @throws IllegalArgumentException if the type cannot be converted to Spark - */ - public static DataType convert(Type type) { - return TypeUtil.visit(type, new TypeToSparkType()); - } - - /** - * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - * - *

This conversion assigns fresh ids. - * - *

Some data types are represented as the same Spark type. These are converted to a default - * type. - * - *

To convert using a reference schema for field ids and ambiguous types, use {@link - * #convert(Schema, StructType)}. - * - * @param sparkType a Spark StructType - * @return the equivalent Schema - * @throws IllegalArgumentException if the type cannot be converted - */ - public static Schema convert(StructType sparkType) { - return convert(sparkType, false); - } - - /** - * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - * - *

This conversion assigns fresh ids. - * - *

Some data types are represented as the same Spark type. These are converted to a default - * type. - * - *

To convert using a reference schema for field ids and ambiguous types, use {@link - * #convert(Schema, StructType)}. - * - * @param sparkType a Spark StructType - * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without - * timezone - * @return the equivalent Schema - * @throws IllegalArgumentException if the type cannot be converted - */ - public static Schema convert(StructType sparkType, boolean useTimestampWithoutZone) { - Type converted = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)); - Schema schema = new Schema(converted.asNestedType().asStructType().fields()); - if (useTimestampWithoutZone) { - schema = SparkFixupTimestampType.fixup(schema); - } - return schema; - } - - /** - * Convert a Spark {@link DataType struct} to a {@link Type} with new field ids. - * - *

This conversion assigns fresh ids. - * - *

Some data types are represented as the same Spark type. These are converted to a default - * type. - * - *

To convert using a reference schema for field ids and ambiguous types, use {@link - * #convert(Schema, StructType)}. - * - * @param sparkType a Spark DataType - * @return the equivalent Type - * @throws IllegalArgumentException if the type cannot be converted - */ - public static Type convert(DataType sparkType) { - return SparkTypeVisitor.visit(sparkType, new SparkTypeToType()); - } - - /** - * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - * - *

This conversion does not assign new ids; it uses ids from the base schema. - * - *

Data types, field order, and nullability will match the spark type. This conversion may - * return a schema that is not compatible with base schema. - * - * @param baseSchema a Schema on which conversion is based - * @param sparkType a Spark StructType - * @return the equivalent Schema - * @throws IllegalArgumentException if the type cannot be converted or there are missing ids - */ - public static Schema convert(Schema baseSchema, StructType sparkType) { - // convert to a type with fresh ids - Types.StructType struct = - SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); - // reassign ids to match the base schema - Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); - // fix types that can't be represented in Spark (UUID and Fixed) - return SparkFixupTypes.fixup(schema, baseSchema); - } - - /** - * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - * - *

This requires that the Spark type is a projection of the Schema. Nullability and types must - * match. - * - * @param schema a Schema - * @param requestedType a projection of the Spark representation of the Schema - * @return a Schema corresponding to the Spark projection - * @throws IllegalArgumentException if the Spark type does not match the Schema - */ - public static Schema prune(Schema schema, StructType requestedType) { - return new Schema( - TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) - .asNestedType() - .asStructType() - .fields()); - } - - /** - * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - * - *

This requires that the Spark type is a projection of the Schema. Nullability and types must - * match. - * - *

The filters list of {@link Expression} is used to ensure that columns referenced by filters - * are projected. - * - * @param schema a Schema - * @param requestedType a projection of the Spark representation of the Schema - * @param filters a list of filters - * @return a Schema corresponding to the Spark projection - * @throws IllegalArgumentException if the Spark type does not match the Schema - */ - public static Schema prune(Schema schema, StructType requestedType, List filters) { - Set filterRefs = Binder.boundReferences(schema.asStruct(), filters, true); - return new Schema( - TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); - } - - /** - * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - * - *

This requires that the Spark type is a projection of the Schema. Nullability and types must - * match. - * - *

The filters list of {@link Expression} is used to ensure that columns referenced by filters - * are projected. - * - * @param schema a Schema - * @param requestedType a projection of the Spark representation of the Schema - * @param filter a filter - * @param caseSensitive whether to bind the filter references case sensitively - * @return a Schema corresponding to the Spark projection - * @throws IllegalArgumentException if the Spark type does not match the Schema - */ - public static Schema prune( - Schema schema, StructType requestedType, Expression filter, boolean caseSensitive) { - Set filterRefs = - Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter), caseSensitive); - - return new Schema( - TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); - } - - private static PartitionSpec identitySpec(Schema schema, Collection columns) { - List names = Lists.newArrayList(); - for (Column column : columns) { - if (column.isPartition()) { - names.add(column.name()); - } - } - - return identitySpec(schema, names); - } - - private static PartitionSpec identitySpec(Schema schema, List partitionNames) { - if (partitionNames == null || partitionNames.isEmpty()) { - return null; - } - - PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); - for (String partitionName : partitionNames) { - builder.identity(partitionName); - } - - return builder.build(); - } - - /** - * Estimate approximate table size based on Spark schema and total records. - * - * @param tableSchema Spark schema - * @param totalRecords total records in the table - * @return approximate size based on table schema - */ - public static long estimateSize(StructType tableSchema, long totalRecords) { - if (totalRecords == Long.MAX_VALUE) { - return totalRecords; - } - - long result; - try { - result = LongMath.checkedMultiply(tableSchema.defaultSize(), totalRecords); - } catch (ArithmeticException e) { - result = Long.MAX_VALUE; - } - return result; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java deleted file mode 100644 index 77cfa0f34c63..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License.
- */ -package org.apache.iceberg.spark; - -import org.apache.iceberg.StructLike; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Row; - -public class SparkStructLike implements StructLike { - - private final Types.StructType type; - private Row wrapped; - - public SparkStructLike(Types.StructType type) { - this.type = type; - } - - public SparkStructLike wrap(Row row) { - this.wrapped = row; - return this; - } - - @Override - public int size() { - return type.fields().size(); - } - - @Override - public T get(int pos, Class javaClass) { - Types.NestedField field = type.fields().get(pos); - return javaClass.cast(SparkValueConverter.convert(field.type(), wrapped.get(pos))); - } - - @Override - public void set(int pos, T value) { - throw new UnsupportedOperationException("Not implemented: set"); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java deleted file mode 100644 index 584468fa006c..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ /dev/null @@ -1,792 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark; - -import static org.apache.spark.sql.functions.col; - -import java.io.IOException; -import java.io.Serializable; -import java.net.URI; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.common.DynMethods; -import org.apache.iceberg.data.TableMigrationUtil; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.Tasks; -import org.apache.spark.TaskContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapPartitionsFunction; -import org.apache.spark.sql.AnalysisException; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.DataFrameReader; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.TableIdentifier; -import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException; -import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute; -import org.apache.spark.sql.catalyst.catalog.CatalogTable; -import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition; -import org.apache.spark.sql.catalyst.catalog.SessionCatalog; -import org.apache.spark.sql.catalyst.expressions.Expression; -import org.apache.spark.sql.catalyst.expressions.NamedExpression; -import org.apache.spark.sql.catalyst.parser.ParseException; -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import scala.Function2; -import scala.Option; -import scala.Predef; -import scala.Some; -import scala.Tuple2; -import scala.collection.JavaConverters; -import scala.collection.Seq; -import scala.runtime.AbstractPartialFunction; - -/** - * Java version of the original 
SparkTableUtil.scala - * https://github.com/apache/iceberg/blob/apache-iceberg-0.8.0-incubating/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala - */ -public class SparkTableUtil { - - private static final Logger LOG = LoggerFactory.getLogger(SparkTableUtil.class); - - private static final Joiner.MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("="); - - private static final PathFilter HIDDEN_PATH_FILTER = - p -> !p.getName().startsWith("_") && !p.getName().startsWith("."); - - private static final String duplicateFileMessage = - "Cannot complete import because data files " - + "to be imported already exist within the target table: %s. " - + "This is disabled by default as Iceberg is not designed for multiple references to the same file" - + " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - - private SparkTableUtil() {} - - /** - * Returns a DataFrame with a row for each partition in the table. - * - *

The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format (avro - * or parquet). - * - * @param spark a Spark session - * @param table a table name and (optional) database - * @return a DataFrame of the table's partitions - */ - public static Dataset partitionDF(SparkSession spark, String table) { - List partitions = getPartitions(spark, table); - return spark - .createDataFrame(partitions, SparkPartition.class) - .toDF("partition", "uri", "format"); - } - - /** - * Returns a DataFrame with a row for each partition that matches the specified 'expression'. - * - * @param spark a Spark session. - * @param table name of the table. - * @param expression The expression whose matching partitions are returned. - * @return a DataFrame of the table partitions. - */ - public static Dataset partitionDFByFilter( - SparkSession spark, String table, String expression) { - List partitions = getPartitionsByFilter(spark, table, expression); - return spark - .createDataFrame(partitions, SparkPartition.class) - .toDF("partition", "uri", "format"); - } - - /** - * Returns all partitions in the table. - * - * @param spark a Spark session - * @param table a table name and (optional) database - * @return all table's partitions - */ - public static List getPartitions(SparkSession spark, String table) { - try { - TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); - return getPartitions(spark, tableIdent, null); - } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unable to parse table identifier: %s", table); - } - } - - /** - * Returns all partitions in the table. - * - * @param spark a Spark session - * @param tableIdent a table identifier - * @param partitionFilter partition filter, or null if no filter - * @return all table's partitions - */ - public static List getPartitions( - SparkSession spark, TableIdentifier tableIdent, Map partitionFilter) { - try { - SessionCatalog catalog = spark.sessionState().catalog(); - CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); - - Option> scalaPartitionFilter; - if (partitionFilter != null && !partitionFilter.isEmpty()) { - scalaPartitionFilter = - Option.apply( - JavaConverters.mapAsScalaMapConverter(partitionFilter) - .asScala() - .toMap(Predef.conforms())); - } else { - scalaPartitionFilter = Option.empty(); - } - Seq partitions = - catalog.listPartitions(tableIdent, scalaPartitionFilter); - return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() - .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) - .collect(Collectors.toList()); - } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unknown table: %s. Database not found in catalog.", tableIdent); - } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unknown table: %s. Table not found in catalog.", tableIdent); - } - } - - /** - * Returns partitions that match the specified 'predicate'. 
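A minimal sketch of these partition helpers in use (table name and predicate are hypothetical):

import java.util.List;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.spark.sql.SparkSession;

class PartitionListingSketch {
  static void show(SparkSession spark) {
    // one row per partition: partition values, location URI, and file format
    SparkTableUtil.partitionDF(spark, "db.sample_table").show();

    // or list only the partitions matching a predicate on the partition columns
    List<SparkTableUtil.SparkPartition> matching =
        SparkTableUtil.getPartitionsByFilter(spark, "db.sample_table", "dt >= '2023-01-01'");
    System.out.println(matching.size() + " matching partitions");
  }
}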
- * - * @param spark a Spark session - * @param table a table name and (optional) database - * @param predicate a predicate on partition columns - * @return matching table's partitions - */ - public static List getPartitionsByFilter( - SparkSession spark, String table, String predicate) { - TableIdentifier tableIdent; - try { - tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); - } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unable to parse the table identifier: %s", table); - } - - Expression unresolvedPredicateExpr; - try { - unresolvedPredicateExpr = spark.sessionState().sqlParser().parseExpression(predicate); - } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unable to parse the predicate expression: %s", predicate); - } - - Expression resolvedPredicateExpr = resolveAttrs(spark, table, unresolvedPredicateExpr); - return getPartitionsByFilter(spark, tableIdent, resolvedPredicateExpr); - } - - /** - * Returns partitions that match the specified 'predicate'. - * - * @param spark a Spark session - * @param tableIdent a table identifier - * @param predicateExpr a predicate expression on partition columns - * @return matching table's partitions - */ - public static List getPartitionsByFilter( - SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) { - try { - SessionCatalog catalog = spark.sessionState().catalog(); - CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); - - Expression resolvedPredicateExpr; - if (!predicateExpr.resolved()) { - resolvedPredicateExpr = resolveAttrs(spark, tableIdent.quotedString(), predicateExpr); - } else { - resolvedPredicateExpr = predicateExpr; - } - Seq predicates = - JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) - .asScala() - .toSeq(); - - Seq partitions = - catalog.listPartitionsByFilter(tableIdent, predicates); - - return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() - .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) - .collect(Collectors.toList()); - } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unknown table: %s. Database not found in catalog.", tableIdent); - } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unknown table: %s. Table not found in catalog.", tableIdent); - } - } - - /** - * Returns the data files in a partition by listing the partition location. - * - *

For Parquet and ORC partitions, this will read metrics from the file footer. For Avro - * partitions, metrics are set to null. - * - * @param partition a partition - * @param conf a serializable Hadoop conf - * @param metricsConfig a metrics conf - * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, - * Configuration, MetricsConfig, NameMapping)} - */ - @Deprecated - public static List listPartition( - SparkPartition partition, - PartitionSpec spec, - SerializableConfiguration conf, - MetricsConfig metricsConfig) { - return listPartition(partition, spec, conf, metricsConfig, null); - } - - /** - * Returns the data files in a partition by listing the partition location. - * - *

For Parquet and ORC partitions, this will read metrics from the file footer. For Avro - * partitions, metrics are set to null. - * - * @param partition a partition - * @param conf a serializable Hadoop conf - * @param metricsConfig a metrics conf - * @param mapping a name mapping - * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, - * Configuration, MetricsConfig, NameMapping)} - */ - @Deprecated - public static List listPartition( - SparkPartition partition, - PartitionSpec spec, - SerializableConfiguration conf, - MetricsConfig metricsConfig, - NameMapping mapping) { - return TableMigrationUtil.listPartition( - partition.values, - partition.uri, - partition.format, - spec, - conf.get(), - metricsConfig, - mapping); - } - - private static SparkPartition toSparkPartition( - CatalogTablePartition partition, CatalogTable table) { - Option locationUri = partition.storage().locationUri(); - Option serde = partition.storage().serde(); - - Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); - Preconditions.checkArgument( - serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); - - String uri = Util.uriToString(locationUri.get()); - String format = serde.nonEmpty() ? serde.get() : table.provider().get(); - - Map partitionSpec = - JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); - return new SparkPartition(partitionSpec, uri, format); - } - - private static Expression resolveAttrs(SparkSession spark, String table, Expression expr) { - Function2 resolver = spark.sessionState().analyzer().resolver(); - LogicalPlan plan = spark.table(table).queryExecution().analyzed(); - return expr.transform( - new AbstractPartialFunction() { - @Override - public Expression apply(Expression attr) { - UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; - Option namedExpressionOption = - plan.resolve(unresolvedAttribute.nameParts(), resolver); - if (namedExpressionOption.isDefined()) { - return (Expression) namedExpressionOption.get(); - } else { - throw new IllegalArgumentException( - String.format("Could not resolve %s using columns: %s", attr, plan.output())); - } - } - - @Override - public boolean isDefinedAt(Expression attr) { - return attr instanceof UnresolvedAttribute; - } - }); - } - - private static Iterator buildManifest( - SerializableConfiguration conf, - PartitionSpec spec, - String basePath, - Iterator> fileTuples) { - if (fileTuples.hasNext()) { - FileIO io = new HadoopFileIO(conf.get()); - TaskContext ctx = TaskContext.get(); - String suffix = - String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); - Path location = new Path(basePath, suffix); - String outputPath = FileFormat.AVRO.addExtension(location.toString()); - OutputFile outputFile = io.newOutputFile(outputPath); - ManifestWriter writer = ManifestFiles.write(spec, outputFile); - - try (ManifestWriter writerRef = writer) { - fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2)); - } catch (IOException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unable to close the manifest writer: %s", outputPath); - } - - ManifestFile manifestFile = writer.toManifestFile(); - return ImmutableList.of(manifestFile).iterator(); - } else { - return Collections.emptyIterator(); - } - } - - /** - * Import files from an existing Spark table to an Iceberg table. - * - *

The import uses the Spark session to get table metadata. It assumes no operation is going on - * the original and target table and thus is not thread-safe. - * - * @param spark a Spark session - * @param sourceTableIdent an identifier of the source Spark table - * @param targetTable an Iceberg table where to import the data - * @param stagingDir a staging directory to store temporary manifest files - * @param partitionFilter only import partitions whose values match those in the map, can be - * partially defined - * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file - */ - public static void importSparkTable( - SparkSession spark, - TableIdentifier sourceTableIdent, - Table targetTable, - String stagingDir, - Map partitionFilter, - boolean checkDuplicateFiles) { - SessionCatalog catalog = spark.sessionState().catalog(); - - String db = - sourceTableIdent.database().nonEmpty() - ? sourceTableIdent.database().get() - : catalog.getCurrentDatabase(); - TableIdentifier sourceTableIdentWithDB = - new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); - - if (!catalog.tableExists(sourceTableIdentWithDB)) { - throw new org.apache.iceberg.exceptions.NoSuchTableException( - "Table %s does not exist", sourceTableIdentWithDB); - } - - try { - PartitionSpec spec = - SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); - - if (Objects.equal(spec, PartitionSpec.unpartitioned())) { - importUnpartitionedSparkTable( - spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); - } else { - List sourceTablePartitions = - getPartitions(spark, sourceTableIdent, partitionFilter); - Preconditions.checkArgument( - !sourceTablePartitions.isEmpty(), - "Cannot find any partitions in table %s", - sourceTableIdent); - importSparkPartitions( - spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); - } - } catch (AnalysisException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unable to get partition spec for table: %s", sourceTableIdentWithDB); - } - } - - /** - * Import files from an existing Spark table to an Iceberg table. - * - *
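Assuming an existing Spark/Hive source table and an already-created Iceberg target table, a minimal sketch of the import described above (identifiers and the staging path are hypothetical):

import java.util.Collections;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.TableIdentifier;
import scala.Some;

class ImportSketch {
  static void run(SparkSession spark, Table targetTable) {
    TableIdentifier source = new TableIdentifier("source_table", Some.apply("db"));
    // import every partition, staging temporary manifests under /tmp/iceberg-staging,
    // and fail if a data file is already referenced by the target table
    SparkTableUtil.importSparkTable(
        spark, source, targetTable, "/tmp/iceberg-staging", Collections.emptyMap(), true);
  }
}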

The import uses the Spark session to get table metadata. It assumes no operation is going on - * the original and target table and thus is not thread-safe. - * - * @param spark a Spark session - * @param sourceTableIdent an identifier of the source Spark table - * @param targetTable an Iceberg table where to import the data - * @param stagingDir a staging directory to store temporary manifest files - * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file - */ - public static void importSparkTable( - SparkSession spark, - TableIdentifier sourceTableIdent, - Table targetTable, - String stagingDir, - boolean checkDuplicateFiles) { - importSparkTable( - spark, - sourceTableIdent, - targetTable, - stagingDir, - Collections.emptyMap(), - checkDuplicateFiles); - } - - /** - * Import files from an existing Spark table to an Iceberg table. - * - *

The import uses the Spark session to get table metadata. It assumes no operation is going on - * the original and target table and thus is not thread-safe. - * - * @param spark a Spark session - * @param sourceTableIdent an identifier of the source Spark table - * @param targetTable an Iceberg table where to import the data - * @param stagingDir a staging directory to store temporary manifest files - */ - public static void importSparkTable( - SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, String stagingDir) { - importSparkTable( - spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); - } - - private static void importUnpartitionedSparkTable( - SparkSession spark, - TableIdentifier sourceTableIdent, - Table targetTable, - boolean checkDuplicateFiles) { - try { - CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); - Option format = - sourceTable.storage().serde().nonEmpty() - ? sourceTable.storage().serde() - : sourceTable.provider(); - Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); - - Map partition = Collections.emptyMap(); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Configuration conf = spark.sessionState().newHadoopConf(); - MetricsConfig metricsConfig = MetricsConfig.forTable(targetTable); - String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = - nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; - - List files = - TableMigrationUtil.listPartition( - partition, - Util.uriToString(sourceTable.location()), - format.get(), - spec, - conf, - metricsConfig, - nameMapping); - - if (checkDuplicateFiles) { - Dataset importedFiles = - spark - .createDataset(Lists.transform(files, f -> f.path().toString()), Encoders.STRING()) - .toDF("file_path"); - Dataset existingFiles = - loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = - existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = - importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); - Preconditions.checkState( - duplicates.isEmpty(), - String.format( - duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); - } - - AppendFiles append = targetTable.newAppend(); - files.forEach(append::appendFile); - append.commit(); - } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unknown table: %s. Database not found in catalog.", sourceTableIdent); - } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException( - e, "Unknown table: %s. Table not found in catalog.", sourceTableIdent); - } - } - - /** - * Import files from given partitions to an Iceberg table. 
- * - * @param spark a Spark session - * @param partitions partitions to import - * @param targetTable an Iceberg table where to import the data - * @param spec a partition spec - * @param stagingDir a staging directory to store temporary manifest files - * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file - */ - public static void importSparkPartitions( - SparkSession spark, - List partitions, - Table targetTable, - PartitionSpec spec, - String stagingDir, - boolean checkDuplicateFiles) { - Configuration conf = spark.sessionState().newHadoopConf(); - SerializableConfiguration serializableConf = new SerializableConfiguration(conf); - int parallelism = - Math.min( - partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); - int numShufflePartitions = spark.sessionState().conf().numShufflePartitions(); - MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties()); - String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = - nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; - - JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD partitionRDD = sparkContext.parallelize(partitions, parallelism); - - Dataset partitionDS = - spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class)); - - Dataset filesToImport = - partitionDS.flatMap( - (FlatMapFunction) - sparkPartition -> - listPartition( - sparkPartition, spec, serializableConf, metricsConfig, nameMapping) - .iterator(), - Encoders.javaSerialization(DataFile.class)); - - if (checkDuplicateFiles) { - Dataset importedFiles = - filesToImport.map(f -> f.path().toString(), Encoders.STRING()).toDF("file_path"); - Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = - existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = - importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); - Preconditions.checkState( - duplicates.isEmpty(), - String.format(duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); - } - - List manifests = - filesToImport - .repartition(numShufflePartitions) - .map( - (MapFunction>) - file -> Tuple2.apply(file.path().toString(), file), - Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) - .orderBy(col("_1")) - .mapPartitions( - (MapPartitionsFunction, ManifestFile>) - fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple), - Encoders.javaSerialization(ManifestFile.class)) - .collectAsList(); - - try { - boolean snapshotIdInheritanceEnabled = - PropertyUtil.propertyAsBoolean( - targetTable.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); - - AppendFiles append = targetTable.newAppend(); - manifests.forEach(append::appendManifest); - append.commit(); - - if (!snapshotIdInheritanceEnabled) { - // delete original manifests as they were rewritten before the commit - deleteManifests(targetTable.io(), manifests); - } - } catch (Throwable e) { - deleteManifests(targetTable.io(), manifests); - throw e; - } - } - - /** - * Import files from given partitions to an Iceberg table. 
- * - * @param spark a Spark session - * @param partitions partitions to import - * @param targetTable an Iceberg table where to import the data - * @param spec a partition spec - * @param stagingDir a staging directory to store temporary manifest files - */ - public static void importSparkPartitions( - SparkSession spark, - List partitions, - Table targetTable, - PartitionSpec spec, - String stagingDir) { - importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, false); - } - - public static List filterPartitions( - List partitions, Map partitionFilter) { - if (partitionFilter.isEmpty()) { - return partitions; - } else { - return partitions.stream() - .filter(p -> p.getValues().entrySet().containsAll(partitionFilter.entrySet())) - .collect(Collectors.toList()); - } - } - - private static void deleteManifests(FileIO io, List manifests) { - Tasks.foreach(manifests) - .noRetry() - .suppressFailureWhenFinished() - .run(item -> io.deleteFile(item.path())); - } - - // Attempt to use Spark3 Catalog resolution if available on the path - private static final DynMethods.UnboundMethod LOAD_METADATA_TABLE = - DynMethods.builder("loadMetadataTable") - .hiddenImpl( - "org.apache.iceberg.spark.Spark3Util", - SparkSession.class, - Table.class, - MetadataTableType.class) - .orNoop() - .build(); - - public static Dataset loadCatalogMetadataTable( - SparkSession spark, Table table, MetadataTableType type) { - Preconditions.checkArgument( - !LOAD_METADATA_TABLE.isNoop(), "Cannot find Spark3Util class but Spark3 is in use"); - return LOAD_METADATA_TABLE.asStatic().invoke(spark, table, type); - } - - public static Dataset loadMetadataTable( - SparkSession spark, Table table, MetadataTableType type) { - if (spark.version().startsWith("3")) { - // construct the metadata table instance directly - Dataset catalogMetadataTable = loadCatalogMetadataTable(spark, table, type); - if (catalogMetadataTable != null) { - return catalogMetadataTable; - } - } - - String tableName = table.name(); - String tableLocation = table.location(); - - DataFrameReader dataFrameReader = spark.read().format("iceberg"); - if (tableName.contains("/")) { - // Hadoop Table or Metadata location passed, load without a catalog - return dataFrameReader.load(tableName + "#" + type); - } - - // Catalog based resolution failed, our catalog may be a non-DatasourceV2 Catalog - if (tableName.startsWith("hadoop.")) { - // Try loading by location as Hadoop table without Catalog - return dataFrameReader.load(tableLocation + "#" + type); - } else if (tableName.startsWith("hive")) { - // Try loading by name as a Hive table without Catalog - return dataFrameReader.load(tableName.replaceFirst("hive\\.", "") + "." + type); - } else { - throw new IllegalArgumentException( - String.format("Cannot find the metadata table for %s of type %s", tableName, type)); - } - } - - /** Class representing a table partition. 
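// A rough usage sketch of the partition-import helpers above, assuming `spark`, `table`,
// and `stagingDir` exist in the caller (the partition values and location are placeholders,
// and java.util.Collections/Map are used to keep the sketch self-contained):
Map<String, String> values = Collections.singletonMap("ds", "2023-04-20");
SparkTableUtil.SparkPartition partition =
    new SparkTableUtil.SparkPartition(
        values, "hdfs://nn:8020/warehouse/db/events/ds=2023-04-20", "parquet");
SparkTableUtil.importSparkPartitions(
    spark, Collections.singletonList(partition), table, table.spec(), stagingDir);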
*/ - public static class SparkPartition implements Serializable { - private final Map values; - private final String uri; - private final String format; - - public SparkPartition(Map values, String uri, String format) { - this.values = Maps.newHashMap(values); - this.uri = uri; - this.format = format; - } - - public Map getValues() { - return values; - } - - public String getUri() { - return uri; - } - - public String getFormat() { - return format; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("values", values) - .add("uri", uri) - .add("format", format) - .toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - SparkPartition that = (SparkPartition) o; - return Objects.equal(values, that.values) - && Objects.equal(uri, that.uri) - && Objects.equal(format, that.format); - } - - @Override - public int hashCode() { - return Objects.hashCode(values, uri, format); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java deleted file mode 100644 index 17499736fbeb..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark; - -import java.util.List; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.BinaryType; -import org.apache.spark.sql.types.BooleanType; -import org.apache.spark.sql.types.ByteType; -import org.apache.spark.sql.types.CharType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DateType; -import org.apache.spark.sql.types.DecimalType; -import org.apache.spark.sql.types.DoubleType; -import org.apache.spark.sql.types.FloatType; -import org.apache.spark.sql.types.IntegerType; -import org.apache.spark.sql.types.LongType; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.ShortType; -import org.apache.spark.sql.types.StringType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.types.TimestampType; -import org.apache.spark.sql.types.VarcharType; - -class SparkTypeToType extends SparkTypeVisitor { - private final StructType root; - private int nextId = 0; - - SparkTypeToType() { - this.root = null; - } - - SparkTypeToType(StructType root) { - this.root = root; - // the root struct's fields use the first ids - this.nextId = root.fields().length; - } - - private int getNextId() { - int next = nextId; - nextId += 1; - return next; - } - - @Override - @SuppressWarnings("ReferenceEquality") - public Type struct(StructType struct, List types) { - StructField[] fields = struct.fields(); - List newFields = Lists.newArrayListWithExpectedSize(fields.length); - boolean isRoot = root == struct; - for (int i = 0; i < fields.length; i += 1) { - StructField field = fields[i]; - Type type = types.get(i); - - int id; - if (isRoot) { - // for new conversions, use ordinals for ids in the root struct - id = i; - } else { - id = getNextId(); - } - - String doc = field.getComment().isDefined() ? 
field.getComment().get() : null; - - if (field.nullable()) { - newFields.add(Types.NestedField.optional(id, field.name(), type, doc)); - } else { - newFields.add(Types.NestedField.required(id, field.name(), type, doc)); - } - } - - return Types.StructType.of(newFields); - } - - @Override - public Type field(StructField field, Type typeResult) { - return typeResult; - } - - @Override - public Type array(ArrayType array, Type elementType) { - if (array.containsNull()) { - return Types.ListType.ofOptional(getNextId(), elementType); - } else { - return Types.ListType.ofRequired(getNextId(), elementType); - } - } - - @Override - public Type map(MapType map, Type keyType, Type valueType) { - if (map.valueContainsNull()) { - return Types.MapType.ofOptional(getNextId(), getNextId(), keyType, valueType); - } else { - return Types.MapType.ofRequired(getNextId(), getNextId(), keyType, valueType); - } - } - - @SuppressWarnings("checkstyle:CyclomaticComplexity") - @Override - public Type atomic(DataType atomic) { - if (atomic instanceof BooleanType) { - return Types.BooleanType.get(); - - } else if (atomic instanceof IntegerType - || atomic instanceof ShortType - || atomic instanceof ByteType) { - return Types.IntegerType.get(); - - } else if (atomic instanceof LongType) { - return Types.LongType.get(); - - } else if (atomic instanceof FloatType) { - return Types.FloatType.get(); - - } else if (atomic instanceof DoubleType) { - return Types.DoubleType.get(); - - } else if (atomic instanceof StringType - || atomic instanceof CharType - || atomic instanceof VarcharType) { - return Types.StringType.get(); - - } else if (atomic instanceof DateType) { - return Types.DateType.get(); - - } else if (atomic instanceof TimestampType) { - return Types.TimestampType.withZone(); - - } else if (atomic instanceof DecimalType) { - return Types.DecimalType.of( - ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); - } else if (atomic instanceof BinaryType) { - return Types.BinaryType.get(); - } - - throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java deleted file mode 100644 index 1ef694263fa4..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark; - -import java.util.List; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.types.UserDefinedType; - -class SparkTypeVisitor { - static T visit(DataType type, SparkTypeVisitor visitor) { - if (type instanceof StructType) { - StructField[] fields = ((StructType) type).fields(); - List fieldResults = Lists.newArrayListWithExpectedSize(fields.length); - - for (StructField field : fields) { - fieldResults.add(visitor.field(field, visit(field.dataType(), visitor))); - } - - return visitor.struct((StructType) type, fieldResults); - - } else if (type instanceof MapType) { - return visitor.map( - (MapType) type, - visit(((MapType) type).keyType(), visitor), - visit(((MapType) type).valueType(), visitor)); - - } else if (type instanceof ArrayType) { - return visitor.array((ArrayType) type, visit(((ArrayType) type).elementType(), visitor)); - - } else if (type instanceof UserDefinedType) { - throw new UnsupportedOperationException("User-defined types are not supported"); - - } else { - return visitor.atomic(type); - } - } - - public T struct(StructType struct, List fieldResults) { - return null; - } - - public T field(StructField field, T typeResult) { - return null; - } - - public T array(ArrayType array, T elementResult) { - return null; - } - - public T map(MapType map, T keyResult, T valueResult) { - return null; - } - - public T atomic(DataType atomic) { - return null; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java deleted file mode 100644 index 2cdec2b0629c..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark; - -import java.util.List; -import java.util.function.BiFunction; -import java.util.function.Function; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopConfigurable; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.transforms.Transform; -import org.apache.iceberg.transforms.UnknownTransform; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.spark.sql.RuntimeConfig; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.util.SerializableConfiguration; - -public class SparkUtil { - - public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = - String.format( - "Cannot handle timestamp without" - + " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" - + " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" - + " but will change their displayed values in Spark. For more information please see" - + " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" - + "-spark-sql-timestamps", - SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); - - private static final String SPARK_CATALOG_CONF_PREFIX = "spark.sql.catalog"; - // Format string used as the prefix for spark configuration keys to override hadoop configuration - // values - // for Iceberg tables from a given catalog. These keys can be specified as - // `spark.sql.catalog.$catalogName.hadoop.*`, - // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark - // session. - private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = - SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; - - private SparkUtil() {} - - public static FileIO serializableFileIO(Table table) { - if (table.io() instanceof HadoopConfigurable) { - // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization - ((HadoopConfigurable) table.io()) - .serializeConfWith(conf -> new SerializableConfiguration(conf)::value); - } - - return table.io(); - } - - /** - * Check whether the partition transforms in a spec can be used to write data. 
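// A minimal sketch, assuming `table` is an org.apache.iceberg.Table loaded elsewhere:
// fail fast before a write if the spec carries an UnknownTransform this module cannot write.
SparkUtil.validatePartitionTransforms(table.spec());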
- * - * @param spec a PartitionSpec - * @throws UnsupportedOperationException if the spec contains unknown partition transforms - */ - public static void validatePartitionTransforms(PartitionSpec spec) { - if (spec.fields().stream().anyMatch(field -> field.transform() instanceof UnknownTransform)) { - String unsupported = - spec.fields().stream() - .map(PartitionField::transform) - .filter(transform -> transform instanceof UnknownTransform) - .map(Transform::toString) - .collect(Collectors.joining(", ")); - - throw new UnsupportedOperationException( - String.format("Cannot write using unsupported transforms: %s", unsupported)); - } - } - - /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the - * catalog and identifier a multipart identifier represents - * - * @param nameParts Multipart identifier representing a table - * @return The CatalogPlugin and Identifier for the table - */ - public static Pair catalogAndIdentifier( - List nameParts, - Function catalogProvider, - BiFunction identiferProvider, - C currentCatalog, - String[] currentNamespace) { - Preconditions.checkArgument( - !nameParts.isEmpty(), "Cannot determine catalog and identifier from empty name"); - - int lastElementIndex = nameParts.size() - 1; - String name = nameParts.get(lastElementIndex); - - if (nameParts.size() == 1) { - // Only a single element, use current catalog and namespace - return Pair.of(currentCatalog, identiferProvider.apply(currentNamespace, name)); - } else { - C catalog = catalogProvider.apply(nameParts.get(0)); - if (catalog == null) { - // The first element was not a valid catalog, treat it like part of the namespace - String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); - return Pair.of(currentCatalog, identiferProvider.apply(namespace, name)); - } else { - // Assume the first element is a valid catalog - String[] namespace = nameParts.subList(1, lastElementIndex).toArray(new String[0]); - return Pair.of(catalog, identiferProvider.apply(namespace, name)); - } - } - } - - /** - * Responsible for checking if the table schema has a timestamp without timezone column - * - * @param schema table schema to check if it contains a timestamp without timezone column - * @return boolean indicating if the schema passed in has a timestamp field without a timezone - */ - public static boolean hasTimestampWithoutZone(Schema schema) { - return TypeUtil.find(schema, t -> Types.TimestampType.withoutZone().equals(t)) != null; - } - - /** - * Checks whether timestamp types for new tables should be stored with timezone info. - * - *

The default value is false and all timestamp fields are stored as {@link - * Types.TimestampType#withZone()}. If enabled, all timestamp fields in new tables will be stored - * as {@link Types.TimestampType#withoutZone()}. - * - * @param sessionConf a Spark runtime config - * @return true if timestamp types for new tables should be stored with timezone info - */ - public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionConf) { - String sessionConfValue = - sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); - if (sessionConfValue != null) { - return Boolean.parseBoolean(sessionConfValue); - } - return SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES_DEFAULT; - } - - /** - * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which - * can be set via `spark.sql.catalog.$catalogName.hadoop.*` - * - *

Mirrors the override of hadoop configurations for a given spark session using - * `spark.hadoop.*`. - * - *

The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting - * them on the SQLConf, where the following will add the property "fs.default.name" with value - * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. SparkSession.builder() - * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") - * .getOrCreate() - * - * @param spark The current Spark session - * @param catalogName Name of the catalog to find overrides for. - * @return the Hadoop Configuration that should be used for this catalog, with catalog specific - * overrides applied. - */ - public static Configuration hadoopConfCatalogOverrides(SparkSession spark, String catalogName) { - // Find keys for the catalog intended to be hadoop configurations - final String hadoopConfCatalogPrefix = hadoopConfPrefixForCatalog(catalogName); - final Configuration conf = spark.sessionState().newHadoopConf(); - spark - .sqlContext() - .conf() - .settings() - .forEach( - (k, v) -> { - // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, - // which we - // avoid using to not have to convert back and forth between scala / java map types. - if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { - conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); - } - }); - return conf; - } - - private static String hadoopConfPrefixForCatalog(String catalogName) { - return String.format(SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR, catalogName); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java deleted file mode 100644 index 741ef00619ea..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import java.nio.ByteBuffer; -import java.sql.Date; -import java.sql.Timestamp; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.util.DateTimeUtils; - -/** A utility class that converts Spark values to Iceberg's internal representation. 
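// A minimal sketch of the converter declared below, assuming `table` is an Iceberg Table
// and `row` is an org.apache.spark.sql.Row that matches its schema:
Record record = SparkValueConverter.convert(table.schema(), row);   // whole-row conversion
Object days = SparkValueConverter.convert(
    Types.DateType.get(), java.sql.Date.valueOf("2023-04-20"));      // DATE -> epoch days (int)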
*/ -public class SparkValueConverter { - - private SparkValueConverter() {} - - public static Record convert(Schema schema, Row row) { - return convert(schema.asStruct(), row); - } - - public static Object convert(Type type, Object object) { - if (object == null) { - return null; - } - - switch (type.typeId()) { - case STRUCT: - return convert(type.asStructType(), (Row) object); - - case LIST: - List convertedList = Lists.newArrayList(); - List list = (List) object; - try { - for (Object element : list) { - convertedList.add(convert(type.asListType().elementType(), element)); - } - return convertedList; - } catch (NullPointerException npe) { - // Scala 2.11 fix: Catch NPE as internal value could be null and scala wrapper does not - // evaluate until iteration. - return null; - } - - case MAP: - Map convertedMap = Maps.newLinkedHashMap(); - Map map = (Map) object; - try { - for (Map.Entry entry : map.entrySet()) { - convertedMap.put( - convert(type.asMapType().keyType(), entry.getKey()), - convert(type.asMapType().valueType(), entry.getValue())); - } - return convertedMap; - } catch (NullPointerException npe) { - // Scala 2.11 fix: Catch NPE as internal value could be null and scala wrapper does not - // evaluate until iteration. - return null; - } - - case DATE: - return DateTimeUtils.fromJavaDate((Date) object); - case TIMESTAMP: - return DateTimeUtils.fromJavaTimestamp((Timestamp) object); - case BINARY: - return ByteBuffer.wrap((byte[]) object); - case INTEGER: - return ((Number) object).intValue(); - case BOOLEAN: - case LONG: - case FLOAT: - case DOUBLE: - case DECIMAL: - case STRING: - case FIXED: - return object; - default: - throw new UnsupportedOperationException("Not a supported type: " + type); - } - } - - private static Record convert(Types.StructType struct, Row row) { - if (row == null) { - return null; - } - - Record record = GenericRecord.create(struct); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - - Type fieldType = field.type(); - - switch (fieldType.typeId()) { - case STRUCT: - record.set(i, convert(fieldType.asStructType(), row.getStruct(i))); - break; - case LIST: - record.set(i, convert(fieldType.asListType(), row.getList(i))); - break; - case MAP: - record.set(i, convert(fieldType.asMapType(), row.getJavaMap(i))); - break; - default: - record.set(i, convert(fieldType, row.get(i))); - } - } - return record; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java deleted file mode 100644 index 52449ddd0317..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import java.util.Locale; -import java.util.Map; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.SnapshotSummary; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.spark.sql.RuntimeConfig; -import org.apache.spark.sql.SparkSession; - -/** - * A class for common Iceberg configs for Spark writes. - * - *

If a config is set at multiple levels, the following order of precedence is used (top to - * bottom; see the sketch after this list): - * - *

- *   1. Write options
- *   2. Session configuration
- *   3. Table metadata
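// A hedged sketch of the precedence above for the data file format: the per-write option
// should win over the table's write.format.default. `df` is an assumed Dataset<Row> and the
// save path is a placeholder; the option names are those defined in SparkWriteOptions.
df.write()
    .format("iceberg")
    .option("write-format", "avro")                 // 1. write option: used
    .option("target-file-size-bytes", "134217728")  // overrides write.target-file-size-bytes
    .mode("append")
    .save("hdfs://nn:8020/warehouse/db/events");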
- * - * The most specific value is set in write options and takes precedence over all other configs. If - * no write option is provided, this class checks the session configuration for any overrides. If no - * applicable value is found in the session configuration, this class uses the table metadata. - * - *

Note this class is NOT meant to be serialized and sent to executors. - */ -public class SparkWriteConf { - - private final RuntimeConfig sessionConf; - private final Map writeOptions; - private final SparkConfParser confParser; - - public SparkWriteConf(SparkSession spark, Table table, Map writeOptions) { - this.sessionConf = spark.conf(); - this.writeOptions = writeOptions; - this.confParser = new SparkConfParser(spark, table, writeOptions); - } - - public boolean checkNullability() { - return confParser - .booleanConf() - .option(SparkWriteOptions.CHECK_NULLABILITY) - .sessionConf(SparkSQLProperties.CHECK_NULLABILITY) - .defaultValue(SparkSQLProperties.CHECK_NULLABILITY_DEFAULT) - .parse(); - } - - public boolean checkOrdering() { - return confParser - .booleanConf() - .option(SparkWriteOptions.CHECK_ORDERING) - .sessionConf(SparkSQLProperties.CHECK_ORDERING) - .defaultValue(SparkSQLProperties.CHECK_ORDERING_DEFAULT) - .parse(); - } - - /** - * Enables writing a timestamp with time zone as a timestamp without time zone. - * - *

Generally, this is not safe as a timestamp without time zone is supposed to represent the - * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, - * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so - * that the corresponding time in the reader timezone is displayed. - * - *

When set to false (default), an exception must be thrown if the table contains a timestamp - * without time zone. - * - * @return boolean indicating if writing timestamps without timezone is allowed - */ - public boolean handleTimestampWithoutZone() { - return confParser - .booleanConf() - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) - .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) - .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) - .parse(); - } - - public String overwriteMode() { - String overwriteMode = writeOptions.get("overwrite-mode"); - return overwriteMode != null ? overwriteMode.toLowerCase(Locale.ROOT) : null; - } - - public String wapId() { - return sessionConf.get("spark.wap.id", null); - } - - public FileFormat dataFileFormat() { - String valueAsString = - confParser - .stringConf() - .option(SparkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); - return FileFormat.fromString(valueAsString); - } - - public long targetDataFileSize() { - return confParser - .longConf() - .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES) - .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) - .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) - .parse(); - } - - public boolean fanoutWriterEnabled() { - return confParser - .booleanConf() - .option(SparkWriteOptions.FANOUT_ENABLED) - .tableProperty(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED) - .defaultValue(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT) - .parse(); - } - - public Map extraSnapshotMetadata() { - Map extraSnapshotMetadata = Maps.newHashMap(); - - writeOptions.forEach( - (key, value) -> { - if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { - extraSnapshotMetadata.put( - key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); - } - }); - - return extraSnapshotMetadata; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java deleted file mode 100644 index 0ba435ae7429..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark; - -/** Spark DF write options */ -public class SparkWriteOptions { - - private SparkWriteOptions() {} - - // Fileformat for write operations(default: Table write.format.default ) - public static final String WRITE_FORMAT = "write-format"; - - // Overrides this table's write.target-file-size-bytes - public static final String TARGET_FILE_SIZE_BYTES = "target-file-size-bytes"; - - // Sets the nullable check on fields(default: true) - public static final String CHECK_NULLABILITY = "check-nullability"; - - // Adds an entry with custom-key and corresponding value in the snapshot summary - // ex: df.write().format(iceberg) - // .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX."key1", "value1") - // .save(location) - public static final String SNAPSHOT_PROPERTY_PREFIX = "snapshot-property"; - - // Overrides table property write.spark.fanout.enabled(default: false) - public static final String FANOUT_ENABLED = "fanout-enabled"; - - // Checks if input schema and table schema are same(default: true) - public static final String CHECK_ORDERING = "check-ordering"; - - // File scan task set ID that indicates which files must be replaced - public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; - - // Controls whether to allow writing timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = - "handle-timestamp-without-timezone"; -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java deleted file mode 100644 index 1e4b0f2f4e3d..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark; - -import java.util.List; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.types.ArrayType$; -import org.apache.spark.sql.types.BinaryType$; -import org.apache.spark.sql.types.BooleanType$; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DateType$; -import org.apache.spark.sql.types.DecimalType$; -import org.apache.spark.sql.types.DoubleType$; -import org.apache.spark.sql.types.FloatType$; -import org.apache.spark.sql.types.IntegerType$; -import org.apache.spark.sql.types.LongType$; -import org.apache.spark.sql.types.MapType$; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StringType$; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType$; -import org.apache.spark.sql.types.TimestampType$; - -class TypeToSparkType extends TypeUtil.SchemaVisitor { - TypeToSparkType() {} - - @Override - public DataType schema(Schema schema, DataType structType) { - return structType; - } - - @Override - public DataType struct(Types.StructType struct, List fieldResults) { - List fields = struct.fields(); - - List sparkFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - DataType type = fieldResults.get(i); - StructField sparkField = - StructField.apply(field.name(), type, field.isOptional(), Metadata.empty()); - if (field.doc() != null) { - sparkField = sparkField.withComment(field.doc()); - } - sparkFields.add(sparkField); - } - - return StructType$.MODULE$.apply(sparkFields); - } - - @Override - public DataType field(Types.NestedField field, DataType fieldResult) { - return fieldResult; - } - - @Override - public DataType list(Types.ListType list, DataType elementResult) { - return ArrayType$.MODULE$.apply(elementResult, list.isElementOptional()); - } - - @Override - public DataType map(Types.MapType map, DataType keyResult, DataType valueResult) { - return MapType$.MODULE$.apply(keyResult, valueResult, map.isValueOptional()); - } - - @Override - public DataType primitive(Type.PrimitiveType primitive) { - switch (primitive.typeId()) { - case BOOLEAN: - return BooleanType$.MODULE$; - case INTEGER: - return IntegerType$.MODULE$; - case LONG: - return LongType$.MODULE$; - case FLOAT: - return FloatType$.MODULE$; - case DOUBLE: - return DoubleType$.MODULE$; - case DATE: - return DateType$.MODULE$; - case TIME: - throw new UnsupportedOperationException("Spark does not support time fields"); - case TIMESTAMP: - return TimestampType$.MODULE$; - case STRING: - return StringType$.MODULE$; - case UUID: - // use String - return StringType$.MODULE$; - case FIXED: - return BinaryType$.MODULE$; - case BINARY: - return BinaryType$.MODULE$; - case DECIMAL: - Types.DecimalType decimal = (Types.DecimalType) primitive; - return DecimalType$.MODULE$.apply(decimal.precision(), decimal.scale()); - default: - throw new UnsupportedOperationException( - "Cannot convert unknown type to Spark: " + primitive); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java deleted file mode 100644 index 
a79f075ef442..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.function.Consumer; -import java.util.function.Predicate; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.BaseDeleteOrphanFilesActionResult; -import org.apache.iceberg.actions.DeleteOrphanFiles; -import org.apache.iceberg.exceptions.RuntimeIOException; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.hadoop.HiddenPathFilter; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.JobGroupInfo; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.Tasks; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.expressions.UserDefinedFunction; -import org.apache.spark.sql.functions; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.util.SerializableConfiguration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * An action that removes orphan metadata and data files by listing a given location and comparing - * the actual files in that location with data and metadata files referenced by all valid snapshots. - * The location must be accessible for listing via the Hadoop {@link FileSystem}. - * - *

By default, this action cleans up the table location returned by {@link Table#location()} and - * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can - * be modified by passing a custom location to {@link #location} and a custom timestamp to {@link - * #olderThan(long)}. For example, someone might point this action to the data folder to clean up - * only orphan data files. In addition, there is a way to configure an alternative delete method via - * {@link #deleteWith(Consumer)}. - * - *
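// A hedged usage sketch of the action below, assuming `spark` and `table` exist: remove
// orphan files under the table's data folder that are older than seven days.
DeleteOrphanFiles.Result result =
    new BaseDeleteOrphanFilesSparkAction(spark, table)
        .location(table.location() + "/data")
        .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
        .execute();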

Note: It is dangerous to call this action with a short retention interval as it might - * corrupt the state of the table if another operation is writing at the same time. - */ -public class BaseDeleteOrphanFilesSparkAction - extends BaseSparkAction - implements DeleteOrphanFiles { - - private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteOrphanFilesSparkAction.class); - private static final UserDefinedFunction filenameUDF = - functions.udf( - (String path) -> { - int lastIndex = path.lastIndexOf(File.separator); - if (lastIndex == -1) { - return path; - } else { - return path.substring(lastIndex + 1); - } - }, - DataTypes.StringType); - - private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null; - - private final SerializableConfiguration hadoopConf; - private final int partitionDiscoveryParallelism; - private final Table table; - - private String location = null; - private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3); - private Consumer deleteFunc = - new Consumer() { - @Override - public void accept(String file) { - table.io().deleteFile(file); - } - }; - - private ExecutorService deleteExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; - - public BaseDeleteOrphanFilesSparkAction(SparkSession spark, Table table) { - super(spark); - - this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf()); - this.partitionDiscoveryParallelism = - spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); - this.table = table; - this.location = table.location(); - - ValidationException.check( - PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), - "Cannot remove orphan files: GC is disabled (deleting files may corrupt other tables)"); - } - - @Override - protected DeleteOrphanFiles self() { - return this; - } - - @Override - public BaseDeleteOrphanFilesSparkAction executeDeleteWith(ExecutorService executorService) { - this.deleteExecutorService = executorService; - return this; - } - - @Override - public BaseDeleteOrphanFilesSparkAction location(String newLocation) { - this.location = newLocation; - return this; - } - - @Override - public BaseDeleteOrphanFilesSparkAction olderThan(long newOlderThanTimestamp) { - this.olderThanTimestamp = newOlderThanTimestamp; - return this; - } - - @Override - public BaseDeleteOrphanFilesSparkAction deleteWith(Consumer newDeleteFunc) { - this.deleteFunc = newDeleteFunc; - return this; - } - - @Override - public DeleteOrphanFiles.Result execute() { - JobGroupInfo info = newJobGroupInfo("REMOVE-ORPHAN-FILES", jobDesc()); - return withJobGroupInfo(info, this::doExecute); - } - - private String jobDesc() { - List options = Lists.newArrayList(); - options.add("older_than=" + olderThanTimestamp); - if (location != null) { - options.add("location=" + location); - } - return String.format( - "Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); - } - - private DeleteOrphanFiles.Result doExecute() { - Dataset validDataFileDF = buildValidDataFileDF(table); - Dataset validMetadataFileDF = buildValidMetadataFileDF(table); - Dataset validFileDF = validDataFileDF.union(validMetadataFileDF); - Dataset actualFileDF = buildActualFileDF(); - - Column actualFileName = filenameUDF.apply(actualFileDF.col("file_path")); - Column validFileName = filenameUDF.apply(validFileDF.col("file_path")); - Column nameEqual = actualFileName.equalTo(validFileName); - Column actualContains = 
actualFileDF.col("file_path").contains(validFileDF.col("file_path")); - Column joinCond = nameEqual.and(actualContains); - List orphanFiles = - actualFileDF.join(validFileDF, joinCond, "leftanti").as(Encoders.STRING()).collectAsList(); - - Tasks.foreach(orphanFiles) - .noRetry() - .executeWith(deleteExecutorService) - .suppressFailureWhenFinished() - .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc)) - .run(deleteFunc::accept); - - return new BaseDeleteOrphanFilesActionResult(orphanFiles); - } - - private Dataset buildActualFileDF() { - List subDirs = Lists.newArrayList(); - List matchingFiles = Lists.newArrayList(); - - Predicate predicate = file -> file.getModificationTime() < olderThanTimestamp; - - // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver - listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles); - - JavaRDD matchingFileRDD = sparkContext().parallelize(matchingFiles, 1); - - if (subDirs.isEmpty()) { - return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path"); - } - - int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism); - JavaRDD subDirRDD = sparkContext().parallelize(subDirs, parallelism); - - Broadcast conf = sparkContext().broadcast(hadoopConf); - JavaRDD matchingLeafFileRDD = - subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp)); - - JavaRDD completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); - return spark() - .createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()) - .toDF("file_path"); - } - - private static void listDirRecursively( - String dir, - Predicate predicate, - Configuration conf, - int maxDepth, - int maxDirectSubDirs, - List remainingSubDirs, - List matchingFiles) { - - // stop listing whenever we reach the max depth - if (maxDepth <= 0) { - remainingSubDirs.add(dir); - return; - } - - try { - Path path = new Path(dir); - FileSystem fs = path.getFileSystem(conf); - - List subDirs = Lists.newArrayList(); - - for (FileStatus file : fs.listStatus(path, HiddenPathFilter.get())) { - if (file.isDirectory()) { - subDirs.add(file.getPath().toString()); - } else if (file.isFile() && predicate.test(file)) { - matchingFiles.add(file.getPath().toString()); - } - } - - // stop listing if the number of direct sub dirs is bigger than maxDirectSubDirs - if (subDirs.size() > maxDirectSubDirs) { - remainingSubDirs.addAll(subDirs); - return; - } - - for (String subDir : subDirs) { - listDirRecursively( - subDir, - predicate, - conf, - maxDepth - 1, - maxDirectSubDirs, - remainingSubDirs, - matchingFiles); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - private static FlatMapFunction, String> listDirsRecursively( - Broadcast conf, long olderThanTimestamp) { - - return dirs -> { - List subDirs = Lists.newArrayList(); - List files = Lists.newArrayList(); - - Predicate predicate = file -> file.getModificationTime() < olderThanTimestamp; - - int maxDepth = 2000; - int maxDirectSubDirs = Integer.MAX_VALUE; - - dirs.forEachRemaining( - dir -> { - listDirRecursively( - dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files); - }); - - if (!subDirs.isEmpty()) { - throw new RuntimeException( - "Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); - } - - return files.iterator(); - }; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java 
b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java deleted file mode 100644 index cba67d57ad14..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.atomic.AtomicLong; -import java.util.function.Consumer; -import org.apache.iceberg.ReachableFileUtil; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableMetadataParser; -import org.apache.iceberg.actions.BaseDeleteReachableFilesActionResult; -import org.apache.iceberg.actions.DeleteReachableFiles; -import org.apache.iceberg.exceptions.NotFoundException; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.JobGroupInfo; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.Tasks; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.functions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark to determine - * which files should be deleted. - */ -@SuppressWarnings("UnnecessaryAnonymousClass") -public class BaseDeleteReachableFilesSparkAction - extends BaseSparkAction - implements DeleteReachableFiles { - private static final Logger LOG = - LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); - - private static final String DATA_FILE = "Data File"; - private static final String MANIFEST = "Manifest"; - private static final String MANIFEST_LIST = "Manifest List"; - private static final String OTHERS = "Others"; - - private static final String STREAM_RESULTS = "stream-results"; - - // Creates an executor service that runs each task in the thread that invokes execute/submit. 
- private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null; - - private final TableMetadata tableMetadata; - - private final Consumer defaultDelete = - new Consumer() { - @Override - public void accept(String file) { - io.deleteFile(file); - } - }; - - private Consumer removeFunc = defaultDelete; - private ExecutorService removeExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; - private FileIO io = new HadoopFileIO(spark().sessionState().newHadoopConf()); - - public BaseDeleteReachableFilesSparkAction(SparkSession spark, String metadataLocation) { - super(spark); - this.tableMetadata = TableMetadataParser.read(io, metadataLocation); - ValidationException.check( - PropertyUtil.propertyAsBoolean(tableMetadata.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), - "Cannot remove files: GC is disabled (deleting files may corrupt other tables)"); - } - - @Override - protected DeleteReachableFiles self() { - return this; - } - - @Override - public DeleteReachableFiles io(FileIO fileIO) { - this.io = fileIO; - return this; - } - - @Override - public DeleteReachableFiles deleteWith(Consumer deleteFunc) { - this.removeFunc = deleteFunc; - return this; - } - - @Override - public DeleteReachableFiles executeDeleteWith(ExecutorService executorService) { - this.removeExecutorService = executorService; - return this; - } - - @Override - public Result execute() { - Preconditions.checkArgument(io != null, "File IO cannot be null"); - String msg = - String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation()); - JobGroupInfo info = newJobGroupInfo("REMOVE-FILES", msg); - return withJobGroupInfo(info, this::doExecute); - } - - private Result doExecute() { - boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, false); - Dataset validFileDF = buildValidFileDF(tableMetadata).distinct(); - if (streamResults) { - return deleteFiles(validFileDF.toLocalIterator()); - } else { - return deleteFiles(validFileDF.collectAsList().iterator()); - } - } - - private Dataset projectFilePathWithType(Dataset ds, String type) { - return ds.select(functions.col("file_path"), functions.lit(type).as("file_type")); - } - - private Dataset buildValidFileDF(TableMetadata metadata) { - Table staticTable = newStaticTable(metadata, io); - return projectFilePathWithType(buildValidDataFileDF(staticTable), DATA_FILE) - .union(projectFilePathWithType(buildManifestFileDF(staticTable), MANIFEST)) - .union(projectFilePathWithType(buildManifestListDF(staticTable), MANIFEST_LIST)) - .union(projectFilePathWithType(buildOtherMetadataFileDF(staticTable), OTHERS)); - } - - @Override - protected Dataset buildOtherMetadataFileDF(Table table) { - List otherMetadataFiles = Lists.newArrayList(); - otherMetadataFiles.addAll(ReachableFileUtil.metadataFileLocations(table, true)); - otherMetadataFiles.add(ReachableFileUtil.versionHintLocation(table)); - otherMetadataFiles.addAll(ReachableFileUtil.statisticsFilesLocations(table)); - return spark().createDataset(otherMetadataFiles, Encoders.STRING()).toDF("file_path"); - } - - /** - * Deletes files passed to it. 
- * - * @param deleted an Iterator of Spark Rows of the structure (path: String, type: String) - * @return Statistics on which files were deleted - */ - private BaseDeleteReachableFilesActionResult deleteFiles(Iterator deleted) { - AtomicLong dataFileCount = new AtomicLong(0L); - AtomicLong manifestCount = new AtomicLong(0L); - AtomicLong manifestListCount = new AtomicLong(0L); - AtomicLong otherFilesCount = new AtomicLong(0L); - - Tasks.foreach(deleted) - .retry(3) - .stopRetryOn(NotFoundException.class) - .suppressFailureWhenFinished() - .executeWith(removeExecutorService) - .onFailure( - (fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run( - fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - removeFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - case OTHERS: - otherFilesCount.incrementAndGet(); - LOG.debug("Others: {}", file); - break; - } - }); - - long filesCount = - dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); - LOG.info("Total files removed: {}", filesCount); - return new BaseDeleteReachableFilesActionResult( - dataFileCount.get(), manifestCount.get(), manifestListCount.get(), otherFilesCount.get()); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java deleted file mode 100644 index 9a04a6425bb9..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - -import java.util.Iterator; -import java.util.List; -import java.util.Set; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.atomic.AtomicLong; -import java.util.function.Consumer; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.actions.ExpireSnapshots; -import org.apache.iceberg.actions.ImmutableExpireSnapshots; -import org.apache.iceberg.exceptions.NotFoundException; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.base.Joiner; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.JobGroupInfo; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.Tasks; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.functions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses - * Spark to determine the delta in files between the pre and post-expiration table metadata. All of - * the same restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. - * - *

<p>
This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and - * then uses metadata tables to find files that can be safely deleted. This is done by anti-joining - * two Datasets that contain all manifest and data files before and after the expiration. The - * snapshot expiration will be fully committed before any deletes are issued. - * - *

<p>
This operation performs a shuffle so the parallelism can be controlled through - * 'spark.sql.shuffle.partitions'. - * - *
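For illustration only (not part of this patch): that shuffle's parallelism can be bounded on the session before the action runs; the value below is a placeholder.

// Controls how many tasks compute the expired-file delta described above.
spark.conf().set("spark.sql.shuffle.partitions", "200");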

<p>
Deletes are still performed locally after retrieving the results from the Spark executors. - */ -@SuppressWarnings("UnnecessaryAnonymousClass") -public class BaseExpireSnapshotsSparkAction - extends BaseSparkAction implements ExpireSnapshots { - private static final Logger LOG = LoggerFactory.getLogger(BaseExpireSnapshotsSparkAction.class); - - public static final String STREAM_RESULTS = "stream-results"; - - private static final String DATA_FILE = "Data File"; - private static final String MANIFEST = "Manifest"; - private static final String MANIFEST_LIST = "Manifest List"; - - // Creates an executor service that runs each task in the thread that invokes execute/submit. - private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null; - - private final Table table; - private final TableOperations ops; - private final Consumer defaultDelete = - new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; - - private final Set expiredSnapshotIds = Sets.newHashSet(); - private Long expireOlderThanValue = null; - private Integer retainLastValue = null; - private Consumer deleteFunc = defaultDelete; - private ExecutorService deleteExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; - private Dataset expiredFiles = null; - - public BaseExpireSnapshotsSparkAction(SparkSession spark, Table table) { - super(spark); - this.table = table; - this.ops = ((HasTableOperations) table).operations(); - - ValidationException.check( - PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), - "Cannot expire snapshots: GC is disabled (deleting files may corrupt other tables)"); - } - - @Override - protected ExpireSnapshots self() { - return this; - } - - @Override - public BaseExpireSnapshotsSparkAction executeDeleteWith(ExecutorService executorService) { - this.deleteExecutorService = executorService; - return this; - } - - @Override - public BaseExpireSnapshotsSparkAction expireSnapshotId(long snapshotId) { - expiredSnapshotIds.add(snapshotId); - return this; - } - - @Override - public BaseExpireSnapshotsSparkAction expireOlderThan(long timestampMillis) { - this.expireOlderThanValue = timestampMillis; - return this; - } - - @Override - public BaseExpireSnapshotsSparkAction retainLast(int numSnapshots) { - Preconditions.checkArgument( - 1 <= numSnapshots, - "Number of snapshots to retain must be at least 1, cannot be: %s", - numSnapshots); - this.retainLastValue = numSnapshots; - return this; - } - - @Override - public BaseExpireSnapshotsSparkAction deleteWith(Consumer newDeleteFunc) { - this.deleteFunc = newDeleteFunc; - return this; - } - - /** - * Expires snapshots and commits the changes to the table, returning a Dataset of files to delete. - * - *

<p>
This does not delete data files. To delete data files, run {@link #execute()}. - * - *

<p>
This may be called before or after {@link #execute()} is called to return the expired file - * list. - * - * @return a Dataset of files that are no longer referenced by the table - */ - public Dataset expire() { - if (expiredFiles == null) { - // fetch metadata before expiration - Dataset originalFiles = buildValidFileDF(ops.current()); - - // perform expiration - org.apache.iceberg.ExpireSnapshots expireSnapshots = - table.expireSnapshots().cleanExpiredFiles(false); - for (long id : expiredSnapshotIds) { - expireSnapshots = expireSnapshots.expireSnapshotId(id); - } - - if (expireOlderThanValue != null) { - expireSnapshots = expireSnapshots.expireOlderThan(expireOlderThanValue); - } - - if (retainLastValue != null) { - expireSnapshots = expireSnapshots.retainLast(retainLastValue); - } - - expireSnapshots.commit(); - - // fetch metadata after expiration - Dataset validFiles = buildValidFileDF(ops.refresh()); - - // determine expired files - this.expiredFiles = originalFiles.except(validFiles); - } - - return expiredFiles; - } - - @Override - public ExpireSnapshots.Result execute() { - JobGroupInfo info = newJobGroupInfo("EXPIRE-SNAPSHOTS", jobDesc()); - return withJobGroupInfo(info, this::doExecute); - } - - private String jobDesc() { - List options = Lists.newArrayList(); - - if (expireOlderThanValue != null) { - options.add("older_than=" + expireOlderThanValue); - } - - if (retainLastValue != null) { - options.add("retain_last=" + retainLastValue); - } - - if (!expiredSnapshotIds.isEmpty()) { - Long first = expiredSnapshotIds.stream().findFirst().get(); - if (expiredSnapshotIds.size() > 1) { - options.add( - String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); - } else { - options.add(String.format("snapshot_id: %s", first)); - } - } - - return String.format( - "Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); - } - - private ExpireSnapshots.Result doExecute() { - boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, false); - if (streamResults) { - return deleteFiles(expire().toLocalIterator()); - } else { - return deleteFiles(expire().collectAsList().iterator()); - } - } - - private Dataset appendTypeString(Dataset ds, String type) { - return ds.select(new Column("file_path"), functions.lit(type).as("file_type")); - } - - private Dataset buildValidFileDF(TableMetadata metadata) { - Table staticTable = newStaticTable(metadata, this.table.io()); - return appendTypeString(buildValidDataFileDF(staticTable), DATA_FILE) - .union(appendTypeString(buildManifestFileDF(staticTable), MANIFEST)) - .union(appendTypeString(buildManifestListDF(staticTable), MANIFEST_LIST)); - } - - /** - * Deletes files passed to it based on their type. 
- * - * @param expired an Iterator of Spark Rows of the structure (path: String, type: String) - * @return Statistics on which files were deleted - */ - private ExpireSnapshots.Result deleteFiles(Iterator expired) { - AtomicLong dataFileCount = new AtomicLong(0L); - AtomicLong manifestCount = new AtomicLong(0L); - AtomicLong manifestListCount = new AtomicLong(0L); - - Tasks.foreach(expired) - .retry(3) - .stopRetryOn(NotFoundException.class) - .suppressFailureWhenFinished() - .executeWith(deleteExecutorService) - .onFailure( - (fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run( - fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - } - }); - - LOG.info( - "Deleted {} total files", - dataFileCount.get() + manifestCount.get() + manifestListCount.get()); - return ImmutableExpireSnapshots.Result.builder() - .deletedDataFilesCount(dataFileCount.get()) - .deletedManifestsCount(manifestCount.get()) - .deletedManifestListsCount(manifestListCount.get()) - .deletedPositionDeleteFilesCount(0L) - .deletedEqualityDeleteFilesCount(0L) - .build(); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java deleted file mode 100644 index cc44b7027915..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java +++ /dev/null @@ -1,408 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
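Note: a minimal sketch, for illustration only and not part of this patch, of how the snapshot-expiration action deleted above was typically invoked (assumes an active SparkSession `spark` and a loaded Iceberg `table`; the retention values are placeholders):

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.actions.ExpireSnapshots;
import org.apache.iceberg.spark.actions.SparkActions;

// Expire snapshots older than a week while always keeping the 10 most recent ones.
ExpireSnapshots.Result result =
    SparkActions.get(spark)
        .expireSnapshots(table)
        .expireOlderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
        .retainLast(10)
        .option("stream-results", "true") // stream the file listing instead of collecting it
        .execute();

// Counts reported by deleteFiles(...) above.
long removedDataFiles = result.deletedDataFilesCount();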
- */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.MetadataTableType.ENTRIES; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.UUID; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.stream.Collectors; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.actions.BaseRewriteManifestsActionResult; -import org.apache.iceberg.actions.RewriteManifests; -import org.apache.iceberg.exceptions.CommitStateUnknownException; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.JobGroupInfo; -import org.apache.iceberg.spark.SparkDataFile; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.Tasks; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapPartitionsFunction; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.internal.SQLConf; -import org.apache.spark.sql.types.StructType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * An action that rewrites manifests in a distributed manner and co-locates metadata for partitions. - * - *

<p>
By default, this action rewrites all manifests for the current partition spec and writes the - * result to the metadata folder. The behavior can be modified by passing a custom predicate to - * {@link #rewriteIf(Predicate)} and a custom spec id to {@link #specId(int)}. In addition, there is - * a way to configure a custom location for new manifests via {@link #stagingLocation}. - */ -public class BaseRewriteManifestsSparkAction - extends BaseSnapshotUpdateSparkAction - implements RewriteManifests { - - private static final Logger LOG = LoggerFactory.getLogger(BaseRewriteManifestsSparkAction.class); - - private static final String USE_CACHING = "use-caching"; - private static final boolean USE_CACHING_DEFAULT = true; - - private final Encoder manifestEncoder; - private final Table table; - private final int formatVersion; - private final FileIO fileIO; - private final long targetManifestSizeBytes; - - private PartitionSpec spec = null; - private Predicate predicate = manifest -> true; - private String stagingLocation = null; - - public BaseRewriteManifestsSparkAction(SparkSession spark, Table table) { - super(spark); - this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class); - this.table = table; - this.spec = table.spec(); - this.targetManifestSizeBytes = - PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.MANIFEST_TARGET_SIZE_BYTES, - TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); - this.fileIO = SparkUtil.serializableFileIO(table); - - // default the staging location to the metadata location - TableOperations ops = ((HasTableOperations) table).operations(); - Path metadataFilePath = new Path(ops.metadataFileLocation("file")); - this.stagingLocation = metadataFilePath.getParent().toString(); - - // use the current table format version for new manifests - this.formatVersion = ops.current().formatVersion(); - } - - @Override - protected RewriteManifests self() { - return this; - } - - @Override - public RewriteManifests specId(int specId) { - Preconditions.checkArgument(table.specs().containsKey(specId), "Invalid spec id %s", specId); - this.spec = table.specs().get(specId); - return this; - } - - @Override - public RewriteManifests rewriteIf(Predicate newPredicate) { - this.predicate = newPredicate; - return this; - } - - @Override - public RewriteManifests stagingLocation(String newStagingLocation) { - this.stagingLocation = newStagingLocation; - return this; - } - - @Override - public RewriteManifests.Result execute() { - String desc = - String.format( - "Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); - JobGroupInfo info = newJobGroupInfo("REWRITE-MANIFESTS", desc); - return withJobGroupInfo(info, this::doExecute); - } - - private RewriteManifests.Result doExecute() { - List matchingManifests = findMatchingManifests(); - if (matchingManifests.isEmpty()) { - return BaseRewriteManifestsActionResult.empty(); - } - - long totalSizeBytes = 0L; - int numEntries = 0; - - for (ManifestFile manifest : matchingManifests) { - ValidationException.check( - hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); - - totalSizeBytes += manifest.length(); - numEntries += - manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); - } - - int targetNumManifests = targetNumManifests(totalSizeBytes); - int targetNumManifestEntries = targetNumManifestEntries(numEntries, targetNumManifests); - - if (targetNumManifests == 1 && matchingManifests.size() == 1) { - return 
BaseRewriteManifestsActionResult.empty(); - } - - Dataset manifestEntryDF = buildManifestEntryDF(matchingManifests); - - List newManifests; - if (spec.fields().size() < 1) { - newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests); - } else { - newManifests = - writeManifestsForPartitionedTable( - manifestEntryDF, targetNumManifests, targetNumManifestEntries); - } - - replaceManifests(matchingManifests, newManifests); - - return new BaseRewriteManifestsActionResult(matchingManifests, newManifests); - } - - private Dataset buildManifestEntryDF(List manifests) { - Dataset manifestDF = - spark() - .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) - .toDF("manifest"); - - Dataset manifestEntryDF = - loadMetadataTable(table, ENTRIES) - .filter("status < 2") // select only live entries - .selectExpr( - "input_file_name() as manifest", - "snapshot_id", - "sequence_number", - "file_sequence_number", - "data_file"); - - Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest")); - return manifestEntryDF - .join(manifestDF, joinCond, "left_semi") - .select("snapshot_id", "sequence_number", "file_sequence_number", "data_file"); - } - - private List writeManifestsForUnpartitionedTable( - Dataset manifestEntryDF, int numManifests) { - Broadcast io = sparkContext().broadcast(fileIO); - StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); - - // we rely only on the target number of manifests for unpartitioned tables - // as we should not worry about having too much metadata per partition - long maxNumManifestEntries = Long.MAX_VALUE; - - return manifestEntryDF - .repartition(numManifests) - .mapPartitions( - toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder) - .collectAsList(); - } - - private List writeManifestsForPartitionedTable( - Dataset manifestEntryDF, int numManifests, int targetNumManifestEntries) { - - Broadcast io = sparkContext().broadcast(fileIO); - StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); - - // we allow the actual size of manifests to be 10% higher if the estimation is not precise - // enough - long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries); - - return withReusableDS( - manifestEntryDF, - df -> { - Column partitionColumn = df.col("data_file.partition"); - return df.repartitionByRange(numManifests, partitionColumn) - .sortWithinPartitions(partitionColumn) - .mapPartitions( - toManifests( - io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder) - .collectAsList(); - }); - } - - private U withReusableDS(Dataset ds, Function, U> func) { - Dataset reusableDS; - boolean useCaching = - PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); - if (useCaching) { - reusableDS = ds.cache(); - } else { - int parallelism = SQLConf.get().numShufflePartitions(); - reusableDS = - ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); - } - - try { - return func.apply(reusableDS); - } finally { - if (useCaching) { - reusableDS.unpersist(false); - } - } - } - - private List findMatchingManifests() { - Snapshot currentSnapshot = table.currentSnapshot(); - - if (currentSnapshot == null) { - return ImmutableList.of(); - } - - return currentSnapshot.dataManifests(table.io()).stream() - .filter(manifest -> manifest.partitionSpecId() == spec.specId() && 
predicate.test(manifest)) - .collect(Collectors.toList()); - } - - private int targetNumManifests(long totalSizeBytes) { - return (int) ((totalSizeBytes + targetManifestSizeBytes - 1) / targetManifestSizeBytes); - } - - private int targetNumManifestEntries(int numEntries, int numManifests) { - return (numEntries + numManifests - 1) / numManifests; - } - - private boolean hasFileCounts(ManifestFile manifest) { - return manifest.addedFilesCount() != null - && manifest.existingFilesCount() != null - && manifest.deletedFilesCount() != null; - } - - private void replaceManifests( - Iterable deletedManifests, Iterable addedManifests) { - try { - boolean snapshotIdInheritanceEnabled = - PropertyUtil.propertyAsBoolean( - table.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); - - org.apache.iceberg.RewriteManifests rewriteManifests = table.rewriteManifests(); - deletedManifests.forEach(rewriteManifests::deleteManifest); - addedManifests.forEach(rewriteManifests::addManifest); - commit(rewriteManifests); - - if (!snapshotIdInheritanceEnabled) { - // delete new manifests as they were rewritten before the commit - deleteFiles(Iterables.transform(addedManifests, ManifestFile::path)); - } - } catch (CommitStateUnknownException commitStateUnknownException) { - // don't clean up added manifest files, because they may have been successfully committed. - throw commitStateUnknownException; - } catch (Exception e) { - // delete all new manifests because the rewrite failed - deleteFiles(Iterables.transform(addedManifests, ManifestFile::path)); - throw e; - } - } - - private void deleteFiles(Iterable locations) { - Tasks.foreach(locations) - .noRetry() - .suppressFailureWhenFinished() - .onFailure((location, exc) -> LOG.warn("Failed to delete: {}", location, exc)) - .run(fileIO::deleteFile); - } - - private static ManifestFile writeManifest( - List rows, - int startIndex, - int endIndex, - Broadcast io, - String location, - int format, - PartitionSpec spec, - StructType sparkType) - throws IOException { - - String manifestName = "optimized-m-" + UUID.randomUUID(); - Path manifestPath = new Path(location, manifestName); - OutputFile outputFile = - io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); - - Types.StructType dataFileType = DataFile.getType(spec.partitionType()); - SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType); - - ManifestWriter writer = ManifestFiles.write(format, spec, outputFile, null); - - try { - for (int index = startIndex; index < endIndex; index++) { - Row row = rows.get(index); - long snapshotId = row.getLong(0); - long sequenceNumber = row.getLong(1); - Long fileSequenceNumber = row.isNullAt(2) ? 
null : row.getLong(2); - Row file = row.getStruct(3); - writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber, fileSequenceNumber); - } - } finally { - writer.close(); - } - - return writer.toManifestFile(); - } - - private static MapPartitionsFunction toManifests( - Broadcast io, - long maxNumManifestEntries, - String location, - int format, - PartitionSpec spec, - StructType sparkType) { - - return rows -> { - List rowsAsList = Lists.newArrayList(rows); - - if (rowsAsList.isEmpty()) { - return Collections.emptyIterator(); - } - - List manifests = Lists.newArrayList(); - if (rowsAsList.size() <= maxNumManifestEntries) { - manifests.add( - writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); - } else { - int midIndex = rowsAsList.size() / 2; - manifests.add( - writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); - manifests.add( - writeManifest( - rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); - } - - return manifests.iterator(); - }; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java deleted file mode 100644 index f68fb4e97e78..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import java.util.Map; -import org.apache.iceberg.actions.SnapshotUpdate; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.spark.sql.SparkSession; - -abstract class BaseSnapshotUpdateSparkAction extends BaseSparkAction - implements SnapshotUpdate { - - private final Map summary = Maps.newHashMap(); - - protected BaseSnapshotUpdateSparkAction(SparkSession spark) { - super(spark); - } - - @Override - public ThisT snapshotProperty(String property, String value) { - summary.put(property, value); - return self(); - } - - protected void commit(org.apache.iceberg.SnapshotUpdate update) { - summary.forEach(update::set); - update.commit(); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java deleted file mode 100644 index c9e61b8c907f..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
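Note: a minimal sketch, for illustration only and not part of this patch, of how the manifest-rewrite action deleted above was typically invoked (assumes `spark` and a loaded Iceberg `table`; the size threshold is a placeholder):

import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;

// Co-locate metadata by rewriting only the small manifests of the current partition spec.
RewriteManifests.Result result =
    SparkActions.get(spark)
        .rewriteManifests(table)
        .rewriteIf(manifest -> manifest.length() < 8 * 1024 * 1024)
        .option("use-caching", "false") // repartition the entries dataset instead of caching it
        .execute();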
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; - -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.ReachableFileUtil; -import org.apache.iceberg.StaticTableOperations; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.actions.Action; -import org.apache.iceberg.io.ClosingIterator; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.JobGroupInfo; -import org.apache.iceberg.spark.JobGroupUtils; -import org.apache.iceberg.spark.SparkTableUtil; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; - -abstract class BaseSparkAction implements Action { - - private static final AtomicInteger JOB_COUNTER = new AtomicInteger(); - - private final SparkSession spark; - private final JavaSparkContext sparkContext; - private final Map options = Maps.newHashMap(); - - protected BaseSparkAction(SparkSession spark) { - this.spark = spark; - this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } - - protected SparkSession spark() { - return spark; - } - - protected JavaSparkContext sparkContext() { - return sparkContext; - } - - protected abstract ThisT self(); - - @Override - public ThisT option(String name, String value) { - options.put(name, value); - return self(); - } - - @Override - public ThisT options(Map newOptions) { - options.putAll(newOptions); - return self(); - } - - protected Map options() { - return options; - } - - protected T withJobGroupInfo(JobGroupInfo info, Supplier supplier) { - SparkContext context = spark().sparkContext(); - JobGroupInfo previousInfo = JobGroupUtils.getJobGroupInfo(context); - try { - JobGroupUtils.setJobGroupInfo(context, info); - return supplier.get(); - } finally { - JobGroupUtils.setJobGroupInfo(context, previousInfo); - } - } - - protected JobGroupInfo newJobGroupInfo(String groupId, String desc) { - return new JobGroupInfo(groupId + "-" + JOB_COUNTER.incrementAndGet(), desc, false); - } - - protected Table newStaticTable(TableMetadata metadata, FileIO io) { - String 
metadataFileLocation = metadata.metadataFileLocation(); - StaticTableOperations ops = new StaticTableOperations(metadataFileLocation, io); - return new BaseTable(ops, metadataFileLocation); - } - - protected Dataset buildValidDataFileDF(Table table) { - JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); - Broadcast ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table)); - - Dataset allManifests = - loadMetadataTable(table, ALL_MANIFESTS) - .selectExpr( - "path", - "length", - "partition_spec_id as partitionSpecId", - "added_snapshot_id as addedSnapshotId") - .dropDuplicates("path") - .repartition( - spark - .sessionState() - .conf() - .numShufflePartitions()) // avoid adaptive execution combining tasks - .as(Encoders.bean(ManifestFileBean.class)); - - return allManifests.flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path"); - } - - protected Dataset buildManifestFileDF(Table table) { - return loadMetadataTable(table, ALL_MANIFESTS).selectExpr("path as file_path"); - } - - protected Dataset buildManifestListDF(Table table) { - List manifestLists = ReachableFileUtil.manifestListLocations(table); - return spark.createDataset(manifestLists, Encoders.STRING()).toDF("file_path"); - } - - protected Dataset buildOtherMetadataFileDF(Table table) { - List otherMetadataFiles = Lists.newArrayList(); - otherMetadataFiles.addAll(ReachableFileUtil.metadataFileLocations(table, false)); - otherMetadataFiles.add(ReachableFileUtil.versionHintLocation(table)); - otherMetadataFiles.addAll(ReachableFileUtil.statisticsFilesLocations(table)); - return spark.createDataset(otherMetadataFiles, Encoders.STRING()).toDF("file_path"); - } - - protected Dataset buildValidMetadataFileDF(Table table) { - Dataset manifestDF = buildManifestFileDF(table); - Dataset manifestListDF = buildManifestListDF(table); - Dataset otherMetadataFileDF = buildOtherMetadataFileDF(table); - - return manifestDF.union(otherMetadataFileDF).union(manifestListDF); - } - - protected Dataset loadMetadataTable(Table table, MetadataTableType type) { - return SparkTableUtil.loadMetadataTable(spark, table, type); - } - - private static class ReadManifest implements FlatMapFunction { - private final Broadcast io; - - ReadManifest(Broadcast io) { - this.io = io; - } - - @Override - public Iterator call(ManifestFileBean manifest) { - return new ClosingIterator<>(ManifestFiles.readPaths(manifest, io.getValue()).iterator()); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java deleted file mode 100644 index bec51944f222..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import org.apache.iceberg.Table; -import org.apache.iceberg.actions.ActionsProvider; -import org.apache.iceberg.actions.DeleteOrphanFiles; -import org.apache.iceberg.actions.DeleteReachableFiles; -import org.apache.iceberg.actions.ExpireSnapshots; -import org.apache.iceberg.actions.RewriteManifests; -import org.apache.spark.sql.SparkSession; - -abstract class BaseSparkActions implements ActionsProvider { - - private final SparkSession spark; - - protected BaseSparkActions(SparkSession spark) { - this.spark = spark; - } - - protected SparkSession spark() { - return spark; - } - - @Override - public DeleteOrphanFiles deleteOrphanFiles(Table table) { - return new BaseDeleteOrphanFilesSparkAction(spark, table); - } - - @Override - public RewriteManifests rewriteManifests(Table table) { - return new BaseRewriteManifestsSparkAction(spark, table); - } - - @Override - public ExpireSnapshots expireSnapshots(Table table) { - return new BaseExpireSnapshotsSparkAction(spark, table); - } - - @Override - public DeleteReachableFiles deleteReachableFiles(String metadataLocation) { - return new BaseDeleteReachableFilesSparkAction(spark, metadataLocation); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java deleted file mode 100644 index 3660b870c63f..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.actions; - -import java.nio.ByteBuffer; -import java.util.List; -import org.apache.iceberg.ManifestContent; -import org.apache.iceberg.ManifestFile; - -public class ManifestFileBean implements ManifestFile { - private String path = null; - private Long length = null; - private Integer partitionSpecId = null; - private Long addedSnapshotId = null; - - public String getPath() { - return path; - } - - public void setPath(String path) { - this.path = path; - } - - public Long getLength() { - return length; - } - - public void setLength(Long length) { - this.length = length; - } - - public Integer getPartitionSpecId() { - return partitionSpecId; - } - - public void setPartitionSpecId(Integer partitionSpecId) { - this.partitionSpecId = partitionSpecId; - } - - public Long getAddedSnapshotId() { - return addedSnapshotId; - } - - public void setAddedSnapshotId(Long addedSnapshotId) { - this.addedSnapshotId = addedSnapshotId; - } - - @Override - public String path() { - return path; - } - - @Override - public long length() { - return length; - } - - @Override - public int partitionSpecId() { - return partitionSpecId; - } - - @Override - public ManifestContent content() { - return ManifestContent.DATA; - } - - @Override - public long sequenceNumber() { - return 0; - } - - @Override - public long minSequenceNumber() { - return 0; - } - - @Override - public Long snapshotId() { - return addedSnapshotId; - } - - @Override - public Integer addedFilesCount() { - return null; - } - - @Override - public Long addedRowsCount() { - return null; - } - - @Override - public Integer existingFilesCount() { - return null; - } - - @Override - public Long existingRowsCount() { - return null; - } - - @Override - public Integer deletedFilesCount() { - return null; - } - - @Override - public Long deletedRowsCount() { - return null; - } - - @Override - public List partitions() { - return null; - } - - @Override - public ByteBuffer keyMetadata() { - return null; - } - - @Override - public ManifestFile copy() { - throw new UnsupportedOperationException("Cannot copy"); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java deleted file mode 100644 index 4b1ea37c2169..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import org.apache.iceberg.actions.ActionsProvider; -import org.apache.spark.sql.SparkSession; - -/** - * An implementation of {@link ActionsProvider} for Spark. - * - *

<p>
This class is the primary API for interacting with actions in Spark that users should use to - * instantiate particular actions. - */ -public class SparkActions extends BaseSparkActions { - - private SparkActions(SparkSession spark) { - super(spark); - } - - public static SparkActions get(SparkSession spark) { - return new SparkActions(spark); - } - - public static SparkActions get() { - return new SparkActions(SparkSession.active()); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java deleted file mode 100644 index 74454fc1e466..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.Pair; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.StringType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -public abstract class AvroWithSparkSchemaVisitor - extends AvroWithPartnerByStructureVisitor { - - @Override - protected boolean isStringType(DataType dataType) { - return dataType instanceof StringType; - } - - @Override - protected boolean isMapType(DataType dataType) { - return dataType instanceof MapType; - } - - @Override - protected DataType arrayElementType(DataType arrayType) { - Preconditions.checkArgument( - arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); - return ((ArrayType) arrayType).elementType(); - } - - @Override - protected DataType mapKeyType(DataType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).keyType(); - } - - @Override - protected DataType mapValueType(DataType mapType) { - Preconditions.checkArgument(isMapType(mapType), "Invalid map: %s is not a map", mapType); - return ((MapType) mapType).valueType(); - } - - @Override - protected Pair fieldNameAndType(DataType structType, int pos) { - Preconditions.checkArgument( - structType instanceof StructType, "Invalid struct: %s is not a struct", structType); - StructField field = ((StructType) structType).apply(pos); - return Pair.of(field.name(), field.dataType()); - } - - @Override - protected DataType nullType() { - return DataTypes.NullType; 
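Note: a minimal sketch, for illustration only and not part of this patch, of this entry point resolving an action against the active session (assumes a loaded Iceberg `table`; the three-day cutoff is a placeholder):

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;

// SparkActions.get() uses SparkSession.active(), as shown above.
DeleteOrphanFiles.Result result =
    SparkActions.get()
        .deleteOrphanFiles(table)
        .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3))
        .execute();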
- } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java deleted file mode 100644 index d74a76f94e87..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import java.util.Deque; -import java.util.List; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; -import org.apache.parquet.schema.Type.Repetition; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -/** - * Visitor for traversing a Parquet type with a companion Spark type. 
- * - * @param the Java class returned by the visitor - */ -public class ParquetWithSparkSchemaVisitor { - private final Deque fieldNames = Lists.newLinkedList(); - - public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor visitor) { - Preconditions.checkArgument(sType != null, "Invalid DataType: null"); - if (type instanceof MessageType) { - Preconditions.checkArgument( - sType instanceof StructType, "Invalid struct: %s is not a struct", sType); - StructType struct = (StructType) sType; - return visitor.message( - struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); - - } else if (type.isPrimitive()) { - return visitor.primitive(sType, type.asPrimitiveType()); - - } else { - // if not a primitive, the typeId must be a group - GroupType group = type.asGroupType(); - OriginalType annotation = group.getOriginalType(); - if (annotation != null) { - switch (annotation) { - case LIST: - Preconditions.checkArgument( - !group.isRepetition(Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", - group); - - GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument( - repeatedElement.isRepetition(Repetition.REPEATED), - "Invalid list: inner group is not repeated"); - Preconditions.checkArgument( - repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", - group); - - Preconditions.checkArgument( - sType instanceof ArrayType, "Invalid list: %s is not an array", sType); - ArrayType array = (ArrayType) sType; - StructField element = - new StructField( - "element", array.elementType(), array.containsNull(), Metadata.empty()); - - visitor.fieldNames.push(repeatedElement.getName()); - try { - T elementResult = null; - if (repeatedElement.getFieldCount() > 0) { - elementResult = visitField(element, repeatedElement.getType(0), visitor); - } - - return visitor.list(array, group, elementResult); - - } finally { - visitor.fieldNames.pop(); - } - - case MAP: - Preconditions.checkArgument( - !group.isRepetition(Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", - group); - Preconditions.checkArgument( - group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", - group); - - GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument( - repeatedKeyValue.isRepetition(Repetition.REPEATED), - "Invalid map: inner group is not repeated"); - Preconditions.checkArgument( - repeatedKeyValue.getFieldCount() <= 2, - "Invalid map: repeated group does not have 2 fields"); - - Preconditions.checkArgument( - sType instanceof MapType, "Invalid map: %s is not a map", sType); - MapType map = (MapType) sType; - StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty()); - StructField valueField = - new StructField( - "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); - - visitor.fieldNames.push(repeatedKeyValue.getName()); - try { - T keyResult = null; - T valueResult = null; - switch (repeatedKeyValue.getFieldCount()) { - case 2: - // if there are 2 fields, both key and value are projected - keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor); - valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor); - break; - case 1: - // if there is just one, use the name to determine what it is - Type 
keyOrValue = repeatedKeyValue.getType(0); - if (keyOrValue.getName().equalsIgnoreCase("key")) { - keyResult = visitField(keyField, keyOrValue, visitor); - // value result remains null - } else { - valueResult = visitField(valueField, keyOrValue, visitor); - // key result remains null - } - break; - default: - // both results will remain null - } - - return visitor.map(map, group, keyResult, valueResult); - - } finally { - visitor.fieldNames.pop(); - } - - default: - } - } - - Preconditions.checkArgument( - sType instanceof StructType, "Invalid struct: %s is not a struct", sType); - StructType struct = (StructType) sType; - return visitor.struct(struct, group, visitFields(struct, group, visitor)); - } - } - - private static T visitField( - StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { - visitor.fieldNames.push(field.getName()); - try { - return visit(sField.dataType(), field, visitor); - } finally { - visitor.fieldNames.pop(); - } - } - - private static List visitFields( - StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { - StructField[] sFields = struct.fields(); - Preconditions.checkArgument( - sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); - List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.length; i += 1) { - Type field = group.getFields().get(i); - StructField sField = sFields[i]; - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", - field.getName(), - sField.name()); - results.add(visitField(sField, field, visitor)); - } - - return results; - } - - public T message(StructType sStruct, MessageType message, List fields) { - return null; - } - - public T struct(StructType sStruct, GroupType struct, List fields) { - return null; - } - - public T list(ArrayType sArray, GroupType array, T element) { - return null; - } - - public T map(MapType sMap, GroupType map, T key, T value) { - return null; - } - - public T primitive(DataType sPrimitive, PrimitiveType primitive) { - return null; - } - - protected String[] currentPath() { - return Lists.newArrayList(fieldNames.descendingIterator()).toArray(new String[0]); - } - - protected String[] path(String name) { - List list = Lists.newArrayList(fieldNames.descendingIterator()); - list.add(name); - return list.toArray(new String[0]); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java deleted file mode 100644 index 4622d2928ac4..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import org.apache.avro.LogicalType; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.DatumReader; -import org.apache.avro.io.Decoder; -import org.apache.iceberg.avro.AvroSchemaWithTypeVisitor; -import org.apache.iceberg.avro.SupportsRowPosition; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.data.avro.DecoderResolver; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; - -public class SparkAvroReader implements DatumReader, SupportsRowPosition { - - private final Schema readSchema; - private final ValueReader reader; - private Schema fileSchema = null; - - public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema) { - this(expectedSchema, readSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public SparkAvroReader( - org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { - this.readSchema = readSchema; - this.reader = - (ValueReader) - AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); - } - - @Override - public void setSchema(Schema newFileSchema) { - this.fileSchema = Schema.applyAliases(newFileSchema, readSchema); - } - - @Override - public InternalRow read(InternalRow reuse, Decoder decoder) throws IOException { - return DecoderResolver.resolveAndRead(decoder, readSchema, fileSchema, reader, reuse); - } - - @Override - public void setRowPositionSupplier(Supplier posSupplier) { - if (reader instanceof SupportsRowPosition) { - ((SupportsRowPosition) reader).setRowPositionSupplier(posSupplier); - } - } - - private static class ReadBuilder extends AvroSchemaWithTypeVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public ValueReader record( - Types.StructType expected, Schema record, List names, List> fields) { - return SparkValueReaders.struct(fields, expected, idToConstant); - } - - @Override - public ValueReader union(Type expected, Schema union, List> options) { - return ValueReaders.union(options); - } - - @Override - public ValueReader array( - Types.ListType expected, Schema array, ValueReader elementReader) { - return SparkValueReaders.array(elementReader); - } - - @Override - public ValueReader map( - Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { - return SparkValueReaders.arrayMap(keyReader, valueReader); - } - - @Override - public ValueReader map(Types.MapType expected, Schema map, ValueReader valueReader) { - return SparkValueReaders.map(SparkValueReaders.strings(), valueReader); - } - - @Override - public ValueReader primitive(Type.PrimitiveType expected, Schema primitive) { - LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - // Spark uses the same representation - return ValueReaders.ints(); - - case "timestamp-millis": - // adjust to microseconds - ValueReader longs = ValueReaders.longs(); - return (ValueReader) (decoder, 
ignored) -> longs.read(decoder, null) * 1000L; - - case "timestamp-micros": - // Spark uses the same representation - return ValueReaders.longs(); - - case "decimal": - return SparkValueReaders.decimal( - ValueReaders.decimalBytesReader(primitive), - ((LogicalTypes.Decimal) logicalType).getScale()); - - case "uuid": - return SparkValueReaders.uuids(); - - default: - throw new IllegalArgumentException("Unknown logical type: " + logicalType); - } - } - - switch (primitive.getType()) { - case NULL: - return ValueReaders.nulls(); - case BOOLEAN: - return ValueReaders.booleans(); - case INT: - return ValueReaders.ints(); - case LONG: - return ValueReaders.longs(); - case FLOAT: - return ValueReaders.floats(); - case DOUBLE: - return ValueReaders.doubles(); - case STRING: - return SparkValueReaders.strings(); - case FIXED: - return ValueReaders.fixed(primitive.getFixedSize()); - case BYTES: - return ValueReaders.bytes(); - case ENUM: - return SparkValueReaders.enums(primitive.getEnumSymbols()); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java deleted file mode 100644 index 15465568c231..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
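Note: a sketch, for illustration only and not part of this patch, of the DatumReader contract SparkAvroReader fulfills, exercised here through plain Avro's DataFileReader rather than Iceberg's own Avro read builder (the file path is a placeholder and `table` is an assumed loaded table):

import java.io.File;
import org.apache.avro.file.DataFileReader;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.AvroSchemaUtil;
import org.apache.spark.sql.catalyst.InternalRow;

// Convert the projected Iceberg schema to its Avro read schema and decode records
// straight into Spark's InternalRow representation using the reader above.
Schema expected = table.schema();
org.apache.avro.Schema readSchema = AvroSchemaUtil.convert(expected, "row");
SparkAvroReader datumReader = new SparkAvroReader(expected, readSchema);

long rowCount = 0;
try (DataFileReader<InternalRow> fileReader =
    new DataFileReader<>(new File("/tmp/sample.avro"), datumReader)) {
  while (fileReader.hasNext()) {
    fileReader.next(null); // decode one InternalRow (discarded in this sketch)
    rowCount++;
  }
}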
- */ -package org.apache.iceberg.spark.data; - -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; -import org.apache.avro.LogicalType; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.io.Encoder; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.avro.MetricsAwareDatumWriter; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.avro.ValueWriters; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.ByteType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.ShortType; -import org.apache.spark.sql.types.StructType; - -public class SparkAvroWriter implements MetricsAwareDatumWriter { - private final StructType dsSchema; - private ValueWriter writer = null; - - public SparkAvroWriter(StructType dsSchema) { - this.dsSchema = dsSchema; - } - - @Override - @SuppressWarnings("unchecked") - public void setSchema(Schema schema) { - this.writer = - (ValueWriter) - AvroWithSparkSchemaVisitor.visit(dsSchema, schema, new WriteBuilder()); - } - - @Override - public void write(InternalRow datum, Encoder out) throws IOException { - writer.write(datum, out); - } - - @Override - public Stream metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends AvroWithSparkSchemaVisitor> { - @Override - public ValueWriter record( - DataType struct, Schema record, List names, List> fields) { - return SparkValueWriters.struct( - fields, - IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()) - .collect(Collectors.toList())); - } - - @Override - public ValueWriter union(DataType type, Schema union, List> options) { - Preconditions.checkArgument( - options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", - union); - Preconditions.checkArgument( - options.size() == 2, "Cannot create writer for non-option union: %s", union); - if (union.getTypes().get(0).getType() == Schema.Type.NULL) { - return ValueWriters.option(0, options.get(1)); - } else { - return ValueWriters.option(1, options.get(0)); - } - } - - @Override - public ValueWriter array(DataType sArray, Schema array, ValueWriter elementWriter) { - return SparkValueWriters.array(elementWriter, arrayElementType(sArray)); - } - - @Override - public ValueWriter map(DataType sMap, Schema map, ValueWriter valueReader) { - return SparkValueWriters.map( - SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); - } - - @Override - public ValueWriter map( - DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return SparkValueWriters.arrayMap( - keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); - } - - @Override - public ValueWriter primitive(DataType type, Schema primitive) { - LogicalType logicalType = primitive.getLogicalType(); - if (logicalType != null) { - switch (logicalType.getName()) { - case "date": - // Spark uses the same representation - return ValueWriters.ints(); - - case "timestamp-micros": - // Spark uses the same representation - return ValueWriters.longs(); - - case "decimal": - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; - return SparkValueWriters.decimal(decimal.getPrecision(), decimal.getScale()); - - case "uuid": - return ValueWriters.uuids(); - - 
default: - throw new IllegalArgumentException("Unsupported logical type: " + logicalType); - } - } - - switch (primitive.getType()) { - case NULL: - return ValueWriters.nulls(); - case BOOLEAN: - return ValueWriters.booleans(); - case INT: - if (type instanceof ByteType) { - return ValueWriters.tinyints(); - } else if (type instanceof ShortType) { - return ValueWriters.shorts(); - } - return ValueWriters.ints(); - case LONG: - return ValueWriters.longs(); - case FLOAT: - return ValueWriters.floats(); - case DOUBLE: - return ValueWriters.doubles(); - case STRING: - return SparkValueWriters.strings(); - case FIXED: - return ValueWriters.fixed(primitive.getFixedSize()); - case BYTES: - return ValueWriters.bytes(); - default: - throw new IllegalArgumentException("Unsupported type: " + primitive); - } - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java deleted file mode 100644 index 78db137054bc..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import java.util.List; -import java.util.Map; -import org.apache.iceberg.orc.OrcRowReader; -import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.ql.exec.vector.StructColumnVector; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; -import org.apache.spark.sql.catalyst.InternalRow; - -/** - * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a set of Spark's UnsafeRows. - * - *
<p>
It minimizes allocations by reusing most of the objects in the implementation. - */ -public class SparkOrcReader implements OrcRowReader { - private final OrcValueReader reader; - - public SparkOrcReader(org.apache.iceberg.Schema expectedSchema, TypeDescription readSchema) { - this(expectedSchema, readSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public SparkOrcReader( - org.apache.iceberg.Schema expectedSchema, - TypeDescription readOrcSchema, - Map idToConstant) { - this.reader = - OrcSchemaWithTypeVisitor.visit( - expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); - } - - @Override - public InternalRow read(VectorizedRowBatch batch, int row) { - return (InternalRow) reader.read(new StructColumnVector(batch.size, batch.cols), row); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - reader.setBatchContext(batchOffsetInFile); - } - - private static class ReadBuilder extends OrcSchemaWithTypeVisitor> { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public OrcValueReader record( - Types.StructType expected, - TypeDescription record, - List names, - List> fields) { - return SparkOrcValueReaders.struct(fields, expected, idToConstant); - } - - @Override - public OrcValueReader list( - Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { - return SparkOrcValueReaders.array(elementReader); - } - - @Override - public OrcValueReader map( - Types.MapType iMap, - TypeDescription map, - OrcValueReader keyReader, - OrcValueReader valueReader) { - return SparkOrcValueReaders.map(keyReader, valueReader); - } - - @Override - public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { - switch (primitive.getCategory()) { - case BOOLEAN: - return OrcValueReaders.booleans(); - case BYTE: - // Iceberg does not have a byte type. Use int - case SHORT: - // Iceberg does not have a short type. Use int - case DATE: - case INT: - return OrcValueReaders.ints(); - case LONG: - return OrcValueReaders.longs(); - case FLOAT: - return OrcValueReaders.floats(); - case DOUBLE: - return OrcValueReaders.doubles(); - case TIMESTAMP_INSTANT: - case TIMESTAMP: - return SparkOrcValueReaders.timestampTzs(); - case DECIMAL: - return SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); - case CHAR: - case VARCHAR: - case STRING: - return SparkOrcValueReaders.utf8String(); - case BINARY: - return OrcValueReaders.bytes(); - default: - throw new IllegalArgumentException("Unhandled type " + primitive); - } - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java deleted file mode 100644 index 9e9b3e53bbcc..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import java.math.BigDecimal; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; -import org.apache.orc.storage.serde2.io.HiveDecimalWritable; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.catalyst.util.ArrayBasedMapData; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.GenericArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.UTF8String; - -public class SparkOrcValueReaders { - private SparkOrcValueReaders() {} - - public static OrcValueReader utf8String() { - return StringReader.INSTANCE; - } - - public static OrcValueReader timestampTzs() { - return TimestampTzReader.INSTANCE; - } - - public static OrcValueReader decimals(int precision, int scale) { - if (precision <= Decimal.MAX_LONG_DIGITS()) { - return new SparkOrcValueReaders.Decimal18Reader(precision, scale); - } else if (precision <= 38) { - return new SparkOrcValueReaders.Decimal38Reader(precision, scale); - } else { - throw new IllegalArgumentException("Invalid precision: " + precision); - } - } - - static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - static OrcValueReader array(OrcValueReader elementReader) { - return new ArrayReader(elementReader); - } - - static OrcValueReader map(OrcValueReader keyReader, OrcValueReader valueReader) { - return new MapReader(keyReader, valueReader); - } - - private static class ArrayReader implements OrcValueReader { - private final OrcValueReader elementReader; - - private ArrayReader(OrcValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public ArrayData nonNullRead(ColumnVector vector, int row) { - ListColumnVector listVector = (ListColumnVector) vector; - int offset = (int) listVector.offsets[row]; - int length = (int) listVector.lengths[row]; - List elements = Lists.newArrayListWithExpectedSize(length); - for (int c = 0; c < length; ++c) { - elements.add(elementReader.read(listVector.child, offset + c)); - } - return new GenericArrayData(elements.toArray()); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - elementReader.setBatchContext(batchOffsetInFile); - } - } - - 
private static class MapReader implements OrcValueReader { - private final OrcValueReader keyReader; - private final OrcValueReader valueReader; - - private MapReader(OrcValueReader keyReader, OrcValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public MapData nonNullRead(ColumnVector vector, int row) { - MapColumnVector mapVector = (MapColumnVector) vector; - int offset = (int) mapVector.offsets[row]; - long length = mapVector.lengths[row]; - List keys = Lists.newArrayListWithExpectedSize((int) length); - List values = Lists.newArrayListWithExpectedSize((int) length); - for (int c = 0; c < length; c++) { - keys.add(keyReader.read(mapVector.keys, offset + c)); - values.add(valueReader.read(mapVector.values, offset + c)); - } - - return new ArrayBasedMapData( - new GenericArrayData(keys.toArray()), new GenericArrayData(values.toArray())); - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - keyReader.setBatchContext(batchOffsetInFile); - valueReader.setBatchContext(batchOffsetInFile); - } - } - - static class StructReader extends OrcValueReaders.StructReader { - private final int numFields; - - protected StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = struct.fields().size(); - } - - @Override - protected InternalRow create() { - return new GenericInternalRow(numFields); - } - - @Override - protected void set(InternalRow struct, int pos, Object value) { - if (value != null) { - struct.update(pos, value); - } else { - struct.setNullAt(pos); - } - } - } - - private static class StringReader implements OrcValueReader { - private static final StringReader INSTANCE = new StringReader(); - - private StringReader() {} - - @Override - public UTF8String nonNullRead(ColumnVector vector, int row) { - BytesColumnVector bytesVector = (BytesColumnVector) vector; - return UTF8String.fromBytes( - bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); - } - } - - private static class TimestampTzReader implements OrcValueReader { - private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - - private TimestampTzReader() {} - - @Override - public Long nonNullRead(ColumnVector vector, int row) { - TimestampColumnVector tcv = (TimestampColumnVector) vector; - return Math.floorDiv(tcv.time[row], 1_000) * 1_000_000 + Math.floorDiv(tcv.nanos[row], 1000); - } - } - - private static class Decimal18Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal18Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public Decimal nonNullRead(ColumnVector vector, int row) { - HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - - // The scale of decimal read from hive ORC file may be not equals to the expected scale. For - // data type - // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and - // store it - // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that - // value.scale() == scale. - // we also need to convert the hive orc decimal to a decimal with expected precision and - // scale. 
- Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return new Decimal().set(value.serialize64(scale), precision, scale); - } - } - - private static class Decimal38Reader implements OrcValueReader { - private final int precision; - private final int scale; - - Decimal38Reader(int precision, int scale) { - this.precision = precision; - this.scale = scale; - } - - @Override - public Decimal nonNullRead(ColumnVector vector, int row) { - BigDecimal value = - ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument( - value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", - precision, - scale, - value); - - return new Decimal().set(new scala.math.BigDecimal(value), precision, scale); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java deleted file mode 100644 index 780090f99109..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data; - -import java.util.List; -import java.util.stream.Stream; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.common.type.HiveDecimal; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.UTF8String; - -class SparkOrcValueWriters { - private SparkOrcValueWriters() {} - - static OrcValueWriter strings() { - return StringWriter.INSTANCE; - } - - static OrcValueWriter timestampTz() { - return TimestampTzWriter.INSTANCE; - } - - static OrcValueWriter decimal(int precision, int scale) { - if (precision <= 18) { - return new Decimal18Writer(scale); - } else { - return new Decimal38Writer(); - } - } - - static OrcValueWriter list(OrcValueWriter element, List orcType) { - return new ListWriter<>(element, orcType); - } - - static OrcValueWriter map( - OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { - return new MapWriter<>(keyWriter, valueWriter, orcTypes); - } - - private static class StringWriter implements OrcValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - @Override - public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { - byte[] value = data.getBytes(); - ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); - } - } - - private static class TimestampTzWriter implements OrcValueWriter { - private static final TimestampTzWriter INSTANCE = new TimestampTzWriter(); - - @Override - public void nonNullWrite(int rowId, Long micros, ColumnVector output) { - TimestampColumnVector cv = (TimestampColumnVector) output; - cv.time[rowId] = Math.floorDiv(micros, 1_000); // millis - cv.nanos[rowId] = (int) Math.floorMod(micros, 1_000_000) * 1_000; // nanos - } - } - - private static class Decimal18Writer implements OrcValueWriter { - private final int scale; - - Decimal18Writer(int scale) { - this.scale = scale; - } - - @Override - public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output) - .vector[rowId].setFromLongAndScale(decimal.toUnscaledLong(), scale); - } - } - - private static class Decimal38Writer implements OrcValueWriter { - - @Override - public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output) - .vector[rowId].set(HiveDecimal.create(decimal.toJavaBigDecimal())); - } - } - - private static class ListWriter implements OrcValueWriter { - private final OrcValueWriter writer; - private final SparkOrcWriter.FieldGetter fieldGetter; - - @SuppressWarnings("unchecked") - ListWriter(OrcValueWriter writer, List orcTypes) { - if (orcTypes.size() != 1) { - throw new IllegalArgumentException( - "Expected one (and same) ORC type for list elements, got: " + orcTypes); - } - this.writer = writer; - this.fieldGetter = - (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); - } - - @Override - public void nonNullWrite(int rowId, 
ArrayData value, ColumnVector output) { - ListColumnVector cv = (ListColumnVector) output; - // record the length and start of the list elements - cv.lengths[rowId] = value.numElements(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough - growColumnVector(cv.child, cv.childCount); - // Add each element - for (int e = 0; e < cv.lengths[rowId]; ++e) { - writer.write((int) (e + cv.offsets[rowId]), fieldGetter.getFieldOrNull(value, e), cv.child); - } - } - - @Override - public Stream> metrics() { - return writer.metrics(); - } - } - - private static class MapWriter implements OrcValueWriter { - private final OrcValueWriter keyWriter; - private final OrcValueWriter valueWriter; - private final SparkOrcWriter.FieldGetter keyFieldGetter; - private final SparkOrcWriter.FieldGetter valueFieldGetter; - - @SuppressWarnings("unchecked") - MapWriter( - OrcValueWriter keyWriter, - OrcValueWriter valueWriter, - List orcTypes) { - if (orcTypes.size() != 2) { - throw new IllegalArgumentException( - "Expected two ORC type descriptions for a map, got: " + orcTypes); - } - this.keyWriter = keyWriter; - this.valueWriter = valueWriter; - this.keyFieldGetter = - (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); - this.valueFieldGetter = - (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); - } - - @Override - public void nonNullWrite(int rowId, MapData map, ColumnVector output) { - ArrayData key = map.keyArray(); - ArrayData value = map.valueArray(); - MapColumnVector cv = (MapColumnVector) output; - // record the length and start of the list elements - cv.lengths[rowId] = value.numElements(); - cv.offsets[rowId] = cv.childCount; - cv.childCount = (int) (cv.childCount + cv.lengths[rowId]); - // make sure the child is big enough - growColumnVector(cv.keys, cv.childCount); - growColumnVector(cv.values, cv.childCount); - // Add each element - for (int e = 0; e < cv.lengths[rowId]; ++e) { - int pos = (int) (e + cv.offsets[rowId]); - keyWriter.write(pos, keyFieldGetter.getFieldOrNull(key, e), cv.keys); - valueWriter.write(pos, valueFieldGetter.getFieldOrNull(value, e), cv.values); - } - } - - @Override - public Stream> metrics() { - return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); - } - } - - private static void growColumnVector(ColumnVector cv, int requestedSize) { - if (cv.isNull.length < requestedSize) { - // Use growth factor of 3 to avoid frequent array allocations - cv.ensureSize(requestedSize * 3, true); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java deleted file mode 100644 index 60868b8700a3..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import java.io.Serializable; -import java.util.List; -import java.util.stream.Stream; -import javax.annotation.Nullable; -import org.apache.iceberg.FieldMetrics; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.orc.GenericOrcWriters; -import org.apache.iceberg.orc.ORCSchemaUtil; -import org.apache.iceberg.orc.OrcRowWriter; -import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; -import org.apache.iceberg.orc.OrcValueWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.SpecializedGetters; - -/** This class acts as an adaptor from an OrcFileAppender to a FileAppender<InternalRow>. */ -public class SparkOrcWriter implements OrcRowWriter { - - private final InternalRowWriter writer; - - public SparkOrcWriter(Schema iSchema, TypeDescription orcSchema) { - Preconditions.checkArgument( - orcSchema.getCategory() == TypeDescription.Category.STRUCT, - "Top level must be a struct " + orcSchema); - - writer = - (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); - } - - @Override - public void write(InternalRow value, VectorizedRowBatch output) { - Preconditions.checkArgument(value != null, "value must not be null"); - writer.writeRow(value, output); - } - - @Override - public List> writers() { - return writer.writers(); - } - - @Override - public Stream> metrics() { - return writer.metrics(); - } - - private static class WriteBuilder extends OrcSchemaWithTypeVisitor> { - private WriteBuilder() {} - - @Override - public OrcValueWriter record( - Types.StructType iStruct, - TypeDescription record, - List names, - List> fields) { - return new InternalRowWriter(fields, record.getChildren()); - } - - @Override - public OrcValueWriter list( - Types.ListType iList, TypeDescription array, OrcValueWriter element) { - return SparkOrcValueWriters.list(element, array.getChildren()); - } - - @Override - public OrcValueWriter map( - Types.MapType iMap, TypeDescription map, OrcValueWriter key, OrcValueWriter value) { - return SparkOrcValueWriters.map(key, value, map.getChildren()); - } - - @Override - public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { - switch (primitive.getCategory()) { - case BOOLEAN: - return GenericOrcWriters.booleans(); - case BYTE: - return GenericOrcWriters.bytes(); - case SHORT: - return GenericOrcWriters.shorts(); - case DATE: - case INT: - return GenericOrcWriters.ints(); - case LONG: - return GenericOrcWriters.longs(); - case FLOAT: - return GenericOrcWriters.floats(ORCSchemaUtil.fieldId(primitive)); - case DOUBLE: - return GenericOrcWriters.doubles(ORCSchemaUtil.fieldId(primitive)); - case BINARY: - return GenericOrcWriters.byteArrays(); - case 
STRING: - case CHAR: - case VARCHAR: - return SparkOrcValueWriters.strings(); - case DECIMAL: - return SparkOrcValueWriters.decimal(primitive.getPrecision(), primitive.getScale()); - case TIMESTAMP_INSTANT: - case TIMESTAMP: - return SparkOrcValueWriters.timestampTz(); - default: - throw new IllegalArgumentException("Unhandled type " + primitive); - } - } - } - - private static class InternalRowWriter extends GenericOrcWriters.StructWriter { - private final List> fieldGetters; - - InternalRowWriter(List> writers, List orcTypes) { - super(writers); - this.fieldGetters = Lists.newArrayListWithExpectedSize(orcTypes.size()); - - for (TypeDescription orcType : orcTypes) { - fieldGetters.add(createFieldGetter(orcType)); - } - } - - @Override - protected Object get(InternalRow struct, int index) { - return fieldGetters.get(index).getFieldOrNull(struct, index); - } - } - - static FieldGetter createFieldGetter(TypeDescription fieldType) { - final FieldGetter fieldGetter; - switch (fieldType.getCategory()) { - case BOOLEAN: - fieldGetter = SpecializedGetters::getBoolean; - break; - case BYTE: - fieldGetter = SpecializedGetters::getByte; - break; - case SHORT: - fieldGetter = SpecializedGetters::getShort; - break; - case DATE: - case INT: - fieldGetter = SpecializedGetters::getInt; - break; - case LONG: - case TIMESTAMP: - case TIMESTAMP_INSTANT: - fieldGetter = SpecializedGetters::getLong; - break; - case FLOAT: - fieldGetter = SpecializedGetters::getFloat; - break; - case DOUBLE: - fieldGetter = SpecializedGetters::getDouble; - break; - case BINARY: - fieldGetter = SpecializedGetters::getBinary; - // getBinary always makes a copy, so we don't need to worry about it - // being changed behind our back. - break; - case DECIMAL: - fieldGetter = - (row, ordinal) -> - row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); - break; - case STRING: - case CHAR: - case VARCHAR: - fieldGetter = SpecializedGetters::getUTF8String; - break; - case STRUCT: - fieldGetter = (row, ordinal) -> row.getStruct(ordinal, fieldType.getChildren().size()); - break; - case LIST: - fieldGetter = SpecializedGetters::getArray; - break; - case MAP: - fieldGetter = SpecializedGetters::getMap; - break; - default: - throw new IllegalArgumentException( - "Encountered an unsupported ORC type during a write from Spark."); - } - - return (row, ordinal) -> { - if (row.isNullAt(ordinal)) { - return null; - } - return fieldGetter.getFieldOrNull(row, ordinal); - }; - } - - interface FieldGetter extends Serializable { - - /** - * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... Calls - * the appropriate getter for the expected data type. - * - * @param row Spark's data representation - * @param ordinal index in the data structure (e.g. column index for InterRow, list index in - * ArrayData, etc..) - * @return field value at ordinal - */ - @Nullable - T getFieldOrNull(SpecializedGetters row, int ordinal); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java deleted file mode 100644 index bba68684a303..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ /dev/null @@ -1,769 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.parquet.ParquetSchemaUtil; -import org.apache.iceberg.parquet.ParquetUtil; -import org.apache.iceberg.parquet.ParquetValueReader; -import org.apache.iceberg.parquet.ParquetValueReaders; -import org.apache.iceberg.parquet.ParquetValueReaders.FloatAsDoubleReader; -import org.apache.iceberg.parquet.ParquetValueReaders.IntAsLongReader; -import org.apache.iceberg.parquet.ParquetValueReaders.PrimitiveReader; -import org.apache.iceberg.parquet.ParquetValueReaders.RepeatedKeyValueReader; -import org.apache.iceberg.parquet.ParquetValueReaders.RepeatedReader; -import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; -import org.apache.iceberg.parquet.ParquetValueReaders.StructReader; -import org.apache.iceberg.parquet.ParquetValueReaders.UnboxedReader; -import org.apache.iceberg.parquet.TypeWithSchemaVisitor; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type.TypeID; -import org.apache.iceberg.types.Types; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.DecimalMetadata; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.catalyst.util.ArrayBasedMapData; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.GenericArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.CalendarInterval; -import org.apache.spark.unsafe.types.UTF8String; - -public class SparkParquetReaders { - private SparkParquetReaders() {} - - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema) { - return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); - } - - @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader( - Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - if 
(ParquetSchemaUtil.hasIds(fileSchema)) { - return (ParquetValueReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); - } else { - return (ParquetValueReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), - fileSchema, - new FallbackReadBuilder(fileSchema, idToConstant)); - } - } - - private static class FallbackReadBuilder extends ReadBuilder { - FallbackReadBuilder(MessageType type, Map idToConstant) { - super(type, idToConstant); - } - - @Override - public ParquetValueReader message( - Types.StructType expected, MessageType message, List> fieldReaders) { - // the top level matches by ID, but the remaining IDs are missing - return super.struct(expected, message, fieldReaders); - } - - @Override - public ParquetValueReader struct( - Types.StructType ignored, GroupType struct, List> fieldReaders) { - // the expected struct is ignored because nested fields are never found when the - List> newFields = - Lists.newArrayListWithExpectedSize(fieldReaders.size()); - List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); - List fields = struct.getFields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i); - int fieldD = type().getMaxDefinitionLevel(path(fieldType.getName())) - 1; - newFields.add(ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - types.add(fieldType); - } - - return new InternalRowReader(types, newFields); - } - } - - private static class ReadBuilder extends TypeWithSchemaVisitor> { - private final MessageType type; - private final Map idToConstant; - - ReadBuilder(MessageType type, Map idToConstant) { - this.type = type; - this.idToConstant = idToConstant; - } - - @Override - public ParquetValueReader message( - Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); - } - - @Override - public ParquetValueReader struct( - Types.StructType expected, GroupType struct, List> fieldReaders) { - // match the expected struct's order - Map> readersById = Maps.newHashMap(); - Map typesById = Maps.newHashMap(); - Map maxDefinitionLevelsById = Maps.newHashMap(); - List fields = struct.getFields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i); - int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1; - if (fieldType.getId() != null) { - int id = fieldType.getId().intValue(); - readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i))); - typesById.put(id, fieldType); - if (idToConstant.containsKey(id)) { - maxDefinitionLevelsById.put(id, fieldD); - } - } - } - - List expectedFields = - expected != null ? 
expected.fields() : ImmutableList.of(); - List> reorderedFields = - Lists.newArrayListWithExpectedSize(expectedFields.size()); - List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); - // Defaulting to parent max definition level - int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath()); - for (Types.NestedField field : expectedFields) { - int id = field.fieldId(); - if (idToConstant.containsKey(id)) { - // containsKey is used because the constant may be null - int fieldMaxDefinitionLevel = - maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel); - reorderedFields.add( - ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel)); - types.add(null); - } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { - reorderedFields.add(ParquetValueReaders.position()); - types.add(null); - } else if (id == MetadataColumns.IS_DELETED.fieldId()) { - reorderedFields.add(ParquetValueReaders.constant(false)); - types.add(null); - } else { - ParquetValueReader reader = readersById.get(id); - if (reader != null) { - reorderedFields.add(reader); - types.add(typesById.get(id)); - } else { - reorderedFields.add(ParquetValueReaders.nulls()); - types.add(null); - } - } - } - - return new InternalRowReader(types, reorderedFields); - } - - @Override - public ParquetValueReader list( - Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type elementType = repeated.getType(0); - int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - - return new ArrayReader<>( - repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); - } - - @Override - public ParquetValueReader map( - Types.MapType expectedMap, - GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; - int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; - - Type keyType = repeatedKeyValue.getType(0); - int keyD = type.getMaxDefinitionLevel(path(keyType.getName())) - 1; - Type valueType = repeatedKeyValue.getType(1); - int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - - return new MapReader<>( - repeatedD, - repeatedR, - ParquetValueReaders.option(keyType, keyD, keyReader), - ParquetValueReaders.option(valueType, valueD, valueReader)); - } - - @Override - public ParquetValueReader primitive( - org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return new StringReader(desc); - case INT_8: - case INT_16: - case INT_32: - if (expected != null && expected.typeId() == Types.LongType.get().typeId()) { - return new IntAsLongReader(desc); - } else { - return new UnboxedReader(desc); - } - case DATE: - case INT_64: - case TIMESTAMP_MICROS: - return new UnboxedReader<>(desc); - case TIMESTAMP_MILLIS: - return new TimestampMillisReader(desc); - case DECIMAL: - DecimalMetadata decimal = primitive.getDecimalMetadata(); - switch 
(primitive.getPrimitiveTypeName()) { - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return new BinaryDecimalReader(desc, decimal.getScale()); - case INT64: - return new LongDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - case INT32: - return new IntegerDecimalReader(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return new ParquetValueReaders.ByteArrayReader(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return new ParquetValueReaders.ByteArrayReader(desc); - case INT32: - if (expected != null && expected.typeId() == TypeID.LONG) { - return new IntAsLongReader(desc); - } else { - return new UnboxedReader<>(desc); - } - case FLOAT: - if (expected != null && expected.typeId() == TypeID.DOUBLE) { - return new FloatAsDoubleReader(desc); - } else { - return new UnboxedReader<>(desc); - } - case BOOLEAN: - case INT64: - case DOUBLE: - return new UnboxedReader<>(desc); - case INT96: - // Impala & Spark used to write timestamps as INT96 without a logical type. For backwards - // compatibility we try to read INT96 as timestamps. - return new TimestampInt96Reader(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - - protected MessageType type() { - return type; - } - } - - private static class BinaryDecimalReader extends PrimitiveReader { - private final int scale; - - BinaryDecimalReader(ColumnDescriptor desc, int scale) { - super(desc); - this.scale = scale; - } - - @Override - public Decimal read(Decimal ignored) { - Binary binary = column.nextBinary(); - return Decimal.fromDecimal(new BigDecimal(new BigInteger(binary.getBytes()), scale)); - } - } - - private static class IntegerDecimalReader extends PrimitiveReader { - private final int precision; - private final int scale; - - IntegerDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public Decimal read(Decimal ignored) { - return Decimal.apply(column.nextInteger(), precision, scale); - } - } - - private static class LongDecimalReader extends PrimitiveReader { - private final int precision; - private final int scale; - - LongDecimalReader(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public Decimal read(Decimal ignored) { - return Decimal.apply(column.nextLong(), precision, scale); - } - } - - private static class TimestampMillisReader extends UnboxedReader { - TimestampMillisReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Long read(Long ignored) { - return readLong(); - } - - @Override - public long readLong() { - return 1000 * column.nextLong(); - } - } - - private static class TimestampInt96Reader extends UnboxedReader { - - TimestampInt96Reader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public Long read(Long ignored) { - return readLong(); - } - - @Override - public long readLong() { - final ByteBuffer byteBuffer = - column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); - return ParquetUtil.extractTimestampInt96(byteBuffer); - } - } - - private static class StringReader extends PrimitiveReader { - 
StringReader(ColumnDescriptor desc) { - super(desc); - } - - @Override - public UTF8String read(UTF8String ignored) { - Binary binary = column.nextBinary(); - ByteBuffer buffer = binary.toByteBuffer(); - if (buffer.hasArray()) { - return UTF8String.fromBytes( - buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining()); - } else { - return UTF8String.fromBytes(binary.getBytes()); - } - } - } - - private static class ArrayReader extends RepeatedReader { - private int readPos = 0; - private int writePos = 0; - - ArrayReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { - super(definitionLevel, repetitionLevel, reader); - } - - @Override - @SuppressWarnings("unchecked") - protected ReusableArrayData newListData(ArrayData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableArrayData) { - return (ReusableArrayData) reuse; - } else { - return new ReusableArrayData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected E getElement(ReusableArrayData list) { - E value = null; - if (readPos < list.capacity()) { - value = (E) list.values[readPos]; - } - - readPos += 1; - - return value; - } - - @Override - protected void addElement(ReusableArrayData reused, E element) { - if (writePos >= reused.capacity()) { - reused.grow(); - } - - reused.values[writePos] = element; - - writePos += 1; - } - - @Override - protected ArrayData buildList(ReusableArrayData list) { - list.setNumElements(writePos); - return list; - } - } - - private static class MapReader - extends RepeatedKeyValueReader { - private int readPos = 0; - private int writePos = 0; - - private final ReusableEntry entry = new ReusableEntry<>(); - private final ReusableEntry nullEntry = new ReusableEntry<>(); - - MapReader( - int definitionLevel, - int repetitionLevel, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { - super(definitionLevel, repetitionLevel, keyReader, valueReader); - } - - @Override - @SuppressWarnings("unchecked") - protected ReusableMapData newMapData(MapData reuse) { - this.readPos = 0; - this.writePos = 0; - - if (reuse instanceof ReusableMapData) { - return (ReusableMapData) reuse; - } else { - return new ReusableMapData(); - } - } - - @Override - @SuppressWarnings("unchecked") - protected Map.Entry getPair(ReusableMapData map) { - Map.Entry kv = nullEntry; - if (readPos < map.capacity()) { - entry.set((K) map.keys.values[readPos], (V) map.values.values[readPos]); - kv = entry; - } - - readPos += 1; - - return kv; - } - - @Override - protected void addPair(ReusableMapData map, K key, V value) { - if (writePos >= map.capacity()) { - map.grow(); - } - - map.keys.values[writePos] = key; - map.values.values[writePos] = value; - - writePos += 1; - } - - @Override - protected MapData buildMap(ReusableMapData map) { - map.setNumElements(writePos); - return map; - } - } - - private static class InternalRowReader extends StructReader { - private final int numFields; - - InternalRowReader(List types, List> readers) { - super(types, readers); - this.numFields = readers.size(); - } - - @Override - protected GenericInternalRow newStructData(InternalRow reuse) { - if (reuse instanceof GenericInternalRow) { - return (GenericInternalRow) reuse; - } else { - return new GenericInternalRow(numFields); - } - } - - @Override - protected Object getField(GenericInternalRow intermediate, int pos) { - return intermediate.genericGet(pos); - } - - @Override - protected InternalRow buildStruct(GenericInternalRow struct) { - return struct; - } 
- - @Override - protected void set(GenericInternalRow row, int pos, Object value) { - row.update(pos, value); - } - - @Override - protected void setNull(GenericInternalRow row, int pos) { - row.setNullAt(pos); - } - - @Override - protected void setBoolean(GenericInternalRow row, int pos, boolean value) { - row.setBoolean(pos, value); - } - - @Override - protected void setInteger(GenericInternalRow row, int pos, int value) { - row.setInt(pos, value); - } - - @Override - protected void setLong(GenericInternalRow row, int pos, long value) { - row.setLong(pos, value); - } - - @Override - protected void setFloat(GenericInternalRow row, int pos, float value) { - row.setFloat(pos, value); - } - - @Override - protected void setDouble(GenericInternalRow row, int pos, double value) { - row.setDouble(pos, value); - } - } - - private static class ReusableMapData extends MapData { - private final ReusableArrayData keys; - private final ReusableArrayData values; - private int numElements; - - private ReusableMapData() { - this.keys = new ReusableArrayData(); - this.values = new ReusableArrayData(); - } - - private void grow() { - keys.grow(); - values.grow(); - } - - private int capacity() { - return keys.capacity(); - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - keys.setNumElements(numElements); - values.setNumElements(numElements); - } - - @Override - public int numElements() { - return numElements; - } - - @Override - public MapData copy() { - return new ArrayBasedMapData(keyArray().copy(), valueArray().copy()); - } - - @Override - public ReusableArrayData keyArray() { - return keys; - } - - @Override - public ReusableArrayData valueArray() { - return values; - } - } - - private static class ReusableArrayData extends ArrayData { - private static final Object[] EMPTY = new Object[0]; - - private Object[] values = EMPTY; - private int numElements = 0; - - private void grow() { - if (values.length == 0) { - this.values = new Object[20]; - } else { - Object[] old = values; - this.values = new Object[old.length << 2]; - // copy the old array in case it has values that can be reused - System.arraycopy(old, 0, values, 0, old.length); - } - } - - private int capacity() { - return values.length; - } - - public void setNumElements(int numElements) { - this.numElements = numElements; - } - - @Override - public Object get(int ordinal, DataType dataType) { - return values[ordinal]; - } - - @Override - public int numElements() { - return numElements; - } - - @Override - public ArrayData copy() { - return new GenericArrayData(array()); - } - - @Override - public Object[] array() { - return Arrays.copyOfRange(values, 0, numElements); - } - - @Override - public void setNullAt(int i) { - values[i] = null; - } - - @Override - public void update(int ordinal, Object value) { - values[ordinal] = value; - } - - @Override - public boolean isNullAt(int ordinal) { - return null == values[ordinal]; - } - - @Override - public boolean getBoolean(int ordinal) { - return (boolean) values[ordinal]; - } - - @Override - public byte getByte(int ordinal) { - return (byte) values[ordinal]; - } - - @Override - public short getShort(int ordinal) { - return (short) values[ordinal]; - } - - @Override - public int getInt(int ordinal) { - return (int) values[ordinal]; - } - - @Override - public long getLong(int ordinal) { - return (long) values[ordinal]; - } - - @Override - public float getFloat(int ordinal) { - return (float) values[ordinal]; - } - - @Override - public double getDouble(int ordinal) 
{ - return (double) values[ordinal]; - } - - @Override - public Decimal getDecimal(int ordinal, int precision, int scale) { - return (Decimal) values[ordinal]; - } - - @Override - public UTF8String getUTF8String(int ordinal) { - return (UTF8String) values[ordinal]; - } - - @Override - public byte[] getBinary(int ordinal) { - return (byte[]) values[ordinal]; - } - - @Override - public CalendarInterval getInterval(int ordinal) { - return (CalendarInterval) values[ordinal]; - } - - @Override - public InternalRow getStruct(int ordinal, int numFields) { - return (InternalRow) values[ordinal]; - } - - @Override - public ArrayData getArray(int ordinal) { - return (ArrayData) values[ordinal]; - } - - @Override - public MapData getMap(int ordinal) { - return (MapData) values[ordinal]; - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java deleted file mode 100644 index c7622678c74d..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ /dev/null @@ -1,457 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data; - -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; -import org.apache.iceberg.parquet.ParquetValueWriter; -import org.apache.iceberg.parquet.ParquetValueWriters; -import org.apache.iceberg.parquet.ParquetValueWriters.PrimitiveWriter; -import org.apache.iceberg.parquet.ParquetValueWriters.RepeatedKeyValueWriter; -import org.apache.iceberg.parquet.ParquetValueWriters.RepeatedWriter; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.io.api.Binary; -import org.apache.parquet.schema.DecimalMetadata; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.ByteType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.ShortType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; - -public class SparkParquetWriters { - private SparkParquetWriters() {} - - @SuppressWarnings("unchecked") - public static ParquetValueWriter buildWriter(StructType dfSchema, MessageType type) { - return (ParquetValueWriter) - ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); - } - - private static class WriteBuilder extends ParquetWithSparkSchemaVisitor> { - private final MessageType type; - - WriteBuilder(MessageType type) { - this.type = type; - } - - @Override - public ParquetValueWriter message( - StructType sStruct, MessageType message, List> fieldWriters) { - return struct(sStruct, message.asGroupType(), fieldWriters); - } - - @Override - public ParquetValueWriter struct( - StructType sStruct, GroupType struct, List> fieldWriters) { - List fields = struct.getFields(); - StructField[] sparkFields = sStruct.fields(); - List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List sparkTypes = Lists.newArrayList(); - for (int i = 0; i < fields.size(); i += 1) { - writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - sparkTypes.add(sparkFields[i].dataType()); - } - - return new InternalRowWriter(writers, sparkTypes); - } - - @Override - public ParquetValueWriter list( - ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { - GroupType repeated = array.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new ArrayDataWriter<>( - repeatedD, - repeatedR, - newOption(repeated.getType(0), elementWriter), - sArray.elementType()); - } - - @Override - public ParquetValueWriter map( - MapType sMap, - GroupType map, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { - GroupType 
repeatedKeyValue = map.getFields().get(0).asGroupType(); - String[] repeatedPath = currentPath(); - - int repeatedD = type.getMaxDefinitionLevel(repeatedPath); - int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - - return new MapDataWriter<>( - repeatedD, - repeatedR, - newOption(repeatedKeyValue.getType(0), keyWriter), - newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.keyType(), - sMap.valueType()); - } - - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { - int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); - return ParquetValueWriters.option(fieldType, maxD, writer); - } - - @Override - public ParquetValueWriter primitive(DataType sType, PrimitiveType primitive) { - ColumnDescriptor desc = type.getColumnDescription(currentPath()); - - if (primitive.getOriginalType() != null) { - switch (primitive.getOriginalType()) { - case ENUM: - case JSON: - case UTF8: - return utf8Strings(desc); - case DATE: - case INT_8: - case INT_16: - case INT_32: - return ints(sType, desc); - case INT_64: - case TIME_MICROS: - case TIMESTAMP_MICROS: - return ParquetValueWriters.longs(desc); - case DECIMAL: - DecimalMetadata decimal = primitive.getDecimalMetadata(); - switch (primitive.getPrimitiveTypeName()) { - case INT32: - return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); - case INT64: - return decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); - default: - throw new UnsupportedOperationException( - "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); - } - case BSON: - return byteArrays(desc); - default: - throw new UnsupportedOperationException( - "Unsupported logical type: " + primitive.getOriginalType()); - } - } - - switch (primitive.getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: - case BINARY: - return byteArrays(desc); - case BOOLEAN: - return ParquetValueWriters.booleans(desc); - case INT32: - return ints(sType, desc); - case INT64: - return ParquetValueWriters.longs(desc); - case FLOAT: - return ParquetValueWriters.floats(desc); - case DOUBLE: - return ParquetValueWriters.doubles(desc); - default: - throw new UnsupportedOperationException("Unsupported type: " + primitive); - } - } - } - - private static PrimitiveWriter ints(DataType type, ColumnDescriptor desc) { - if (type instanceof ByteType) { - return ParquetValueWriters.tinyints(desc); - } else if (type instanceof ShortType) { - return ParquetValueWriters.shorts(desc); - } - return ParquetValueWriters.ints(desc); - } - - private static PrimitiveWriter utf8Strings(ColumnDescriptor desc) { - return new UTF8StringWriter(desc); - } - - private static PrimitiveWriter decimalAsInteger( - ColumnDescriptor desc, int precision, int scale) { - return new IntegerDecimalWriter(desc, precision, scale); - } - - private static PrimitiveWriter decimalAsLong( - ColumnDescriptor desc, int precision, int scale) { - return new LongDecimalWriter(desc, precision, scale); - } - - private static PrimitiveWriter decimalAsFixed( - ColumnDescriptor desc, int precision, int scale) { - return new FixedDecimalWriter(desc, precision, scale); - } - - private static PrimitiveWriter byteArrays(ColumnDescriptor desc) { - return new ByteArrayWriter(desc); - } - - private static class UTF8StringWriter extends PrimitiveWriter { - private UTF8StringWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void 
write(int repetitionLevel, UTF8String value) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(value.getBytes())); - } - } - - private static class IntegerDecimalWriter extends PrimitiveWriter { - private final int precision; - private final int scale; - - private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); - } - } - - private static class LongDecimalWriter extends PrimitiveWriter { - private final int precision; - private final int scale; - - private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - } - - @Override - public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument( - decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", - precision, - scale, - decimal); - Preconditions.checkArgument( - decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", - precision, - scale, - decimal); - - column.writeLong(repetitionLevel, decimal.toUnscaledLong()); - } - } - - private static class FixedDecimalWriter extends PrimitiveWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { - super(desc); - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(int repetitionLevel, Decimal decimal) { - byte[] binary = - DecimalUtil.toReusedFixLengthBytes( - precision, scale, decimal.toJavaBigDecimal(), bytes.get()); - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); - } - } - - private static class ByteArrayWriter extends PrimitiveWriter { - private ByteArrayWriter(ColumnDescriptor desc) { - super(desc); - } - - @Override - public void write(int repetitionLevel, byte[] bytes) { - column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(bytes)); - } - } - - private static class ArrayDataWriter extends RepeatedWriter { - private final DataType elementType; - - private ArrayDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter writer, - DataType elementType) { - super(definitionLevel, repetitionLevel, writer); - this.elementType = elementType; - } - - @Override - protected Iterator elements(ArrayData list) { - return new ElementIterator<>(list); - } - - private class ElementIterator implements Iterator { - private final int size; - private final ArrayData list; - private int index; - - private ElementIterator(ArrayData list) { - this.list = list; - size = list.numElements(); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public E next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - E element; - if (list.isNullAt(index)) { - element = null; - } 
else { - element = (E) list.get(index, elementType); - } - - index += 1; - - return element; - } - } - } - - private static class MapDataWriter extends RepeatedKeyValueWriter { - private final DataType keyType; - private final DataType valueType; - - private MapDataWriter( - int definitionLevel, - int repetitionLevel, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter, - DataType keyType, - DataType valueType) { - super(definitionLevel, repetitionLevel, keyWriter, valueWriter); - this.keyType = keyType; - this.valueType = valueType; - } - - @Override - protected Iterator> pairs(MapData map) { - return new EntryIterator<>(map); - } - - private class EntryIterator implements Iterator> { - private final int size; - private final ArrayData keys; - private final ArrayData values; - private final ReusableEntry entry; - private int index; - - private EntryIterator(MapData map) { - size = map.numElements(); - keys = map.keyArray(); - values = map.valueArray(); - entry = new ReusableEntry<>(); - index = 0; - } - - @Override - public boolean hasNext() { - return index != size; - } - - @Override - @SuppressWarnings("unchecked") - public Map.Entry next() { - if (index >= size) { - throw new NoSuchElementException(); - } - - if (values.isNullAt(index)) { - entry.set((K) keys.get(index, keyType), null); - } else { - entry.set((K) keys.get(index, keyType), (V) values.get(index, valueType)); - } - - index += 1; - - return entry; - } - } - } - - private static class InternalRowWriter extends ParquetValueWriters.StructWriter { - private final DataType[] types; - - private InternalRowWriter(List> writers, List types) { - super(writers); - this.types = types.toArray(new DataType[types.size()]); - } - - @Override - protected Object get(InternalRow struct, int index) { - return struct.get(index, types[index]); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java deleted file mode 100644 index 11655c72d857..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
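Editor's note on the SparkParquetWriters file removed above: its IntegerDecimalWriter and LongDecimalWriter validate scale and precision before emitting the unscaled value into an INT32 or INT64 Parquet column. A minimal, dependency-free sketch of that guard-and-encode step, using java.math.BigDecimal in place of Spark's Decimal (the DecimalEncodingSketch class and encodeDecimalAsLong method are illustrative names, not part of the removed code):

import java.math.BigDecimal;

public class DecimalEncodingSketch {
  // Mirrors the checks in the removed LongDecimalWriter: reject a wrong scale or an
  // overflowing precision, then hand back the unscaled value for the physical column.
  static long encodeDecimalAsLong(BigDecimal value, int precision, int scale) {
    if (value.scale() != scale) {
      throw new IllegalArgumentException(String.format(
          "Cannot write value as decimal(%d,%d), wrong scale: %s", precision, scale, value));
    }
    if (value.precision() > precision) {
      throw new IllegalArgumentException(String.format(
          "Cannot write value as decimal(%d,%d), too large: %s", precision, scale, value));
    }
    return value.unscaledValue().longValueExact();
  }

  public static void main(String[] args) {
    // 1234.56 stored as decimal(9,2) -> unscaled long 123456
    System.out.println(encodeDecimalAsLong(new BigDecimal("1234.56"), 9, 2));
  }
}

The same checks appear in both removed decimal writers; only the final writeInteger versus writeLong call on the column differs.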
- */ -package org.apache.iceberg.spark.data; - -import java.io.IOException; -import java.math.BigDecimal; -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Map; -import org.apache.avro.io.Decoder; -import org.apache.avro.util.Utf8; -import org.apache.iceberg.avro.ValueReader; -import org.apache.iceberg.avro.ValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.UUIDUtil; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.catalyst.util.ArrayBasedMapData; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.GenericArrayData; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.UTF8String; - -public class SparkValueReaders { - - private SparkValueReaders() {} - - static ValueReader strings() { - return StringReader.INSTANCE; - } - - static ValueReader enums(List symbols) { - return new EnumReader(symbols); - } - - static ValueReader uuids() { - return UUIDReader.INSTANCE; - } - - static ValueReader decimal(ValueReader unscaledReader, int scale) { - return new DecimalReader(unscaledReader, scale); - } - - static ValueReader array(ValueReader elementReader) { - return new ArrayReader(elementReader); - } - - static ValueReader arrayMap( - ValueReader keyReader, ValueReader valueReader) { - return new ArrayMapReader(keyReader, valueReader); - } - - static ValueReader map(ValueReader keyReader, ValueReader valueReader) { - return new MapReader(keyReader, valueReader); - } - - static ValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); - } - - private static class StringReader implements ValueReader { - private static final StringReader INSTANCE = new StringReader(); - - private StringReader() {} - - @Override - public UTF8String read(Decoder decoder, Object reuse) throws IOException { - // use the decoder's readString(Utf8) method because it may be a resolving decoder - Utf8 utf8 = null; - if (reuse instanceof UTF8String) { - utf8 = new Utf8(((UTF8String) reuse).getBytes()); - } - - Utf8 string = decoder.readString(utf8); - return UTF8String.fromBytes(string.getBytes(), 0, string.getByteLength()); - // int length = decoder.readInt(); - // byte[] bytes = new byte[length]; - // decoder.readFixed(bytes, 0, length); - // return UTF8String.fromBytes(bytes); - } - } - - private static class EnumReader implements ValueReader { - private final UTF8String[] symbols; - - private EnumReader(List symbols) { - this.symbols = new UTF8String[symbols.size()]; - for (int i = 0; i < this.symbols.length; i += 1) { - this.symbols[i] = UTF8String.fromBytes(symbols.get(i).getBytes(StandardCharsets.UTF_8)); - } - } - - @Override - public UTF8String read(Decoder decoder, Object ignore) throws IOException { - int index = decoder.readEnum(); - return symbols[index]; - } - } - - private static class UUIDReader implements ValueReader { - private static final ThreadLocal BUFFER = - ThreadLocal.withInitial( - () -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); - - private static final UUIDReader INSTANCE = new UUIDReader(); - - private UUIDReader() {} - - @Override - 
@SuppressWarnings("ByteBufferBackingArray") - public UTF8String read(Decoder decoder, Object reuse) throws IOException { - ByteBuffer buffer = BUFFER.get(); - buffer.rewind(); - - decoder.readFixed(buffer.array(), 0, 16); - - return UTF8String.fromString(UUIDUtil.convert(buffer).toString()); - } - } - - private static class DecimalReader implements ValueReader { - private final ValueReader bytesReader; - private final int scale; - - private DecimalReader(ValueReader bytesReader, int scale) { - this.bytesReader = bytesReader; - this.scale = scale; - } - - @Override - public Decimal read(Decoder decoder, Object reuse) throws IOException { - byte[] bytes = bytesReader.read(decoder, null); - return Decimal.apply(new BigDecimal(new BigInteger(bytes), scale)); - } - } - - private static class ArrayReader implements ValueReader { - private final ValueReader elementReader; - private final List reusedList = Lists.newArrayList(); - - private ArrayReader(ValueReader elementReader) { - this.elementReader = elementReader; - } - - @Override - public GenericArrayData read(Decoder decoder, Object reuse) throws IOException { - reusedList.clear(); - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedList.add(elementReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - // this will convert the list to an array so it is okay to reuse the list - return new GenericArrayData(reusedList.toArray()); - } - } - - private static class ArrayMapReader implements ValueReader { - private final ValueReader keyReader; - private final ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private ArrayMapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public ArrayBasedMapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readArrayStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.arrayNext(); - } - - return new ArrayBasedMapData( - new GenericArrayData(reusedKeyList.toArray()), - new GenericArrayData(reusedValueList.toArray())); - } - } - - private static class MapReader implements ValueReader { - private final ValueReader keyReader; - private final ValueReader valueReader; - - private final List reusedKeyList = Lists.newArrayList(); - private final List reusedValueList = Lists.newArrayList(); - - private MapReader(ValueReader keyReader, ValueReader valueReader) { - this.keyReader = keyReader; - this.valueReader = valueReader; - } - - @Override - public ArrayBasedMapData read(Decoder decoder, Object reuse) throws IOException { - reusedKeyList.clear(); - reusedValueList.clear(); - - long chunkLength = decoder.readMapStart(); - - while (chunkLength > 0) { - for (int i = 0; i < chunkLength; i += 1) { - reusedKeyList.add(keyReader.read(decoder, null)); - reusedValueList.add(valueReader.read(decoder, null)); - } - - chunkLength = decoder.mapNext(); - } - - return new ArrayBasedMapData( - new GenericArrayData(reusedKeyList.toArray()), - new GenericArrayData(reusedValueList.toArray())); - } - } - - static class StructReader extends ValueReaders.StructReader { - private final int 
numFields; - - protected StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); - this.numFields = readers.size(); - } - - @Override - protected InternalRow reuseOrCreate(Object reuse) { - if (reuse instanceof GenericInternalRow - && ((GenericInternalRow) reuse).numFields() == numFields) { - return (InternalRow) reuse; - } - return new GenericInternalRow(numFields); - } - - @Override - protected Object get(InternalRow struct, int pos) { - return null; - } - - @Override - protected void set(InternalRow struct, int pos, Object value) { - if (value != null) { - struct.update(pos, value); - } else { - struct.setNullAt(pos); - } - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java deleted file mode 100644 index 5f2e2c054888..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
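Editor's note on the SparkValueReaders file removed above: its UUIDReader reads a 16-byte Avro fixed value into a reusable big-endian buffer and converts it to a UTF8String. A self-contained sketch of that byte layout (most-significant long first), with no Avro or Spark dependency and illustrative names:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.UUID;

public class UuidBytesSketch {
  // Rebuilds a UUID from 16 big-endian bytes, the layout the removed UUIDReader consumes.
  static UUID fromBytes(byte[] sixteenBytes) {
    ByteBuffer buffer = ByteBuffer.wrap(sixteenBytes).order(ByteOrder.BIG_ENDIAN);
    long mostSignificant = buffer.getLong();
    long leastSignificant = buffer.getLong();
    return new UUID(mostSignificant, leastSignificant);
  }

  public static void main(String[] args) {
    UUID original = UUID.randomUUID();
    ByteBuffer buffer = ByteBuffer.allocate(16).order(ByteOrder.BIG_ENDIAN);
    buffer.putLong(original.getMostSignificantBits());
    buffer.putLong(original.getLeastSignificantBits());
    // Round-trips to the same UUID
    System.out.println(original.equals(fromBytes(buffer.array())));
  }
}

The writer side in the SparkValueWriters file that follows uses the same most-significant-then-least-significant ordering when it serializes a UUID string back to the 16-byte fixed value.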
- */ -package org.apache.iceberg.spark.data; - -import java.io.IOException; -import java.lang.reflect.Array; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.List; -import java.util.UUID; -import org.apache.avro.io.Encoder; -import org.apache.avro.util.Utf8; -import org.apache.iceberg.avro.ValueWriter; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.DecimalUtil; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.UTF8String; - -public class SparkValueWriters { - - private SparkValueWriters() {} - - static ValueWriter strings() { - return StringWriter.INSTANCE; - } - - static ValueWriter uuids() { - return UUIDWriter.INSTANCE; - } - - static ValueWriter decimal(int precision, int scale) { - return new DecimalWriter(precision, scale); - } - - static ValueWriter array(ValueWriter elementWriter, DataType elementType) { - return new ArrayWriter<>(elementWriter, elementType); - } - - static ValueWriter arrayMap( - ValueWriter keyWriter, DataType keyType, ValueWriter valueWriter, DataType valueType) { - return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter map( - ValueWriter keyWriter, DataType keyType, ValueWriter valueWriter, DataType valueType) { - return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); - } - - static ValueWriter struct(List> writers, List types) { - return new StructWriter(writers, types); - } - - private static class StringWriter implements ValueWriter { - private static final StringWriter INSTANCE = new StringWriter(); - - private StringWriter() {} - - @Override - public void write(UTF8String s, Encoder encoder) throws IOException { - // use getBytes because it may return the backing byte array if available. 
- // otherwise, it copies to a new byte array, which is still cheaper than Avro - // calling toString, which incurs encoding costs - encoder.writeString(new Utf8(s.getBytes())); - } - } - - private static class UUIDWriter implements ValueWriter { - private static final ThreadLocal BUFFER = - ThreadLocal.withInitial( - () -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); - - private static final UUIDWriter INSTANCE = new UUIDWriter(); - - private UUIDWriter() {} - - @Override - @SuppressWarnings("ByteBufferBackingArray") - public void write(UTF8String s, Encoder encoder) throws IOException { - // TODO: direct conversion from string to byte buffer - UUID uuid = UUID.fromString(s.toString()); - ByteBuffer buffer = BUFFER.get(); - buffer.rewind(); - buffer.putLong(uuid.getMostSignificantBits()); - buffer.putLong(uuid.getLeastSignificantBits()); - encoder.writeFixed(buffer.array()); - } - } - - private static class DecimalWriter implements ValueWriter { - private final int precision; - private final int scale; - private final ThreadLocal bytes; - - private DecimalWriter(int precision, int scale) { - this.precision = precision; - this.scale = scale; - this.bytes = - ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); - } - - @Override - public void write(Decimal d, Encoder encoder) throws IOException { - encoder.writeFixed( - DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); - } - } - - private static class ArrayWriter implements ValueWriter { - private final ValueWriter elementWriter; - private final DataType elementType; - - private ArrayWriter(ValueWriter elementWriter, DataType elementType) { - this.elementWriter = elementWriter; - this.elementType = elementType; - } - - @Override - @SuppressWarnings("unchecked") - public void write(ArrayData array, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = array.numElements(); - encoder.setItemCount(numElements); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - elementWriter.write((T) array.get(i, elementType), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class ArrayMapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final DataType keyType; - private final DataType valueType; - - private ArrayMapWriter( - ValueWriter keyWriter, - DataType keyType, - ValueWriter valueWriter, - DataType valueType) { - this.keyWriter = keyWriter; - this.keyType = keyType; - this.valueWriter = valueWriter; - this.valueType = valueType; - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeArrayStart(); - int numElements = map.numElements(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyArray.get(i, keyType), encoder); - valueWriter.write((V) valueArray.get(i, valueType), encoder); - } - encoder.writeArrayEnd(); - } - } - - private static class MapWriter implements ValueWriter { - private final ValueWriter keyWriter; - private final ValueWriter valueWriter; - private final DataType keyType; - private final DataType valueType; - - private MapWriter( - ValueWriter keyWriter, - DataType keyType, - ValueWriter valueWriter, - DataType valueType) { - 
this.keyWriter = keyWriter; - this.keyType = keyType; - this.valueWriter = valueWriter; - this.valueType = valueType; - } - - @Override - @SuppressWarnings("unchecked") - public void write(MapData map, Encoder encoder) throws IOException { - encoder.writeMapStart(); - int numElements = map.numElements(); - encoder.setItemCount(numElements); - ArrayData keyArray = map.keyArray(); - ArrayData valueArray = map.valueArray(); - for (int i = 0; i < numElements; i += 1) { - encoder.startItem(); - keyWriter.write((K) keyArray.get(i, keyType), encoder); - valueWriter.write((V) valueArray.get(i, valueType), encoder); - } - encoder.writeMapEnd(); - } - } - - static class StructWriter implements ValueWriter { - private final ValueWriter[] writers; - private final DataType[] types; - - @SuppressWarnings("unchecked") - private StructWriter(List> writers, List types) { - this.writers = (ValueWriter[]) Array.newInstance(ValueWriter.class, writers.size()); - this.types = new DataType[writers.size()]; - for (int i = 0; i < writers.size(); i += 1) { - this.writers[i] = writers.get(i); - this.types[i] = types.get(i); - } - } - - ValueWriter[] writers() { - return writers; - } - - @Override - public void write(InternalRow row, Encoder encoder) throws IOException { - for (int i = 0; i < types.length; i += 1) { - if (row.isNullAt(i)) { - writers[i].write(null, encoder); - } else { - write(row, i, writers[i], encoder); - } - } - } - - @SuppressWarnings("unchecked") - private void write(InternalRow row, int pos, ValueWriter writer, Encoder encoder) - throws IOException { - writer.write((T) row.get(pos, types[pos]), encoder); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java deleted file mode 100644 index e32ebcb02bbc..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
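Editor's note on the SparkValueWriters file removed above: its StructWriter keeps field writers and Spark DataTypes in parallel arrays and still invokes the field writer when a value is null, presumably leaving the union's null branch to the option writer it wraps. A reduced sketch of that positional dispatch with a toy writer interface (FieldWriter and StructDispatchSketch are made-up names for illustration, not the removed API):

import java.util.Arrays;
import java.util.List;

public class StructDispatchSketch {
  // Toy stand-in for the Avro-backed ValueWriter used by the removed code.
  interface FieldWriter {
    void write(Object value);
  }

  static final class StructWriter {
    private final FieldWriter[] writers;

    StructWriter(List<FieldWriter> fieldWriters) {
      this.writers = fieldWriters.toArray(new FieldWriter[0]);
    }

    // Walk the fields positionally; nulls are passed to the field writer as well,
    // matching the null handling in the removed StructWriter.
    void write(Object[] row) {
      for (int i = 0; i < writers.length; i += 1) {
        writers[i].write(row[i]);
      }
    }
  }

  public static void main(String[] args) {
    StructWriter writer = new StructWriter(Arrays.asList(
        value -> System.out.println("id=" + value),
        value -> System.out.println("name=" + value)));
    writer.write(new Object[] {1L, null});
  }
}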
- */ -package org.apache.iceberg.spark.data.vectorized; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import org.apache.arrow.memory.ArrowBuf; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VarCharVector; -import org.apache.arrow.vector.complex.ListVector; -import org.apache.iceberg.arrow.vectorized.GenericArrowVectorAccessorFactory; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ArrowColumnVector; -import org.apache.spark.sql.vectorized.ColumnarArray; -import org.apache.spark.unsafe.types.UTF8String; - -final class ArrowVectorAccessorFactory - extends GenericArrowVectorAccessorFactory< - Decimal, UTF8String, ColumnarArray, ArrowColumnVector> { - - ArrowVectorAccessorFactory() { - super( - DecimalFactoryImpl::new, - StringFactoryImpl::new, - StructChildFactoryImpl::new, - ArrayFactoryImpl::new); - } - - private static final class DecimalFactoryImpl implements DecimalFactory { - @Override - public Class getGenericClass() { - return Decimal.class; - } - - @Override - public Decimal ofLong(long value, int precision, int scale) { - return Decimal.apply(value, precision, scale); - } - - @Override - public Decimal ofBigDecimal(BigDecimal value, int precision, int scale) { - return Decimal.apply(value, precision, scale); - } - } - - private static final class StringFactoryImpl implements StringFactory { - @Override - public Class getGenericClass() { - return UTF8String.class; - } - - @Override - public UTF8String ofRow(VarCharVector vector, int rowId) { - int start = vector.getStartOffset(rowId); - int end = vector.getEndOffset(rowId); - - return UTF8String.fromAddress( - null, vector.getDataBuffer().memoryAddress() + start, end - start); - } - - @Override - public UTF8String ofBytes(byte[] bytes) { - return UTF8String.fromBytes(bytes); - } - - @Override - public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { - if (byteBuffer.hasArray()) { - return UTF8String.fromBytes( - byteBuffer.array(), - byteBuffer.arrayOffset() + byteBuffer.position(), - byteBuffer.remaining()); - } - byte[] bytes = new byte[byteBuffer.remaining()]; - byteBuffer.get(bytes); - return UTF8String.fromBytes(bytes); - } - } - - private static final class ArrayFactoryImpl - implements ArrayFactory { - @Override - public ArrowColumnVector ofChild(ValueVector childVector) { - return new ArrowColumnVector(childVector); - } - - @Override - public ColumnarArray ofRow(ValueVector vector, ArrowColumnVector childData, int rowId) { - ArrowBuf offsets = vector.getOffsetBuffer(); - int index = rowId * ListVector.OFFSET_WIDTH; - int start = offsets.getInt(index); - int end = offsets.getInt(index + ListVector.OFFSET_WIDTH); - return new ColumnarArray(childData, start, end - start); - } - } - - private static final class StructChildFactoryImpl - implements StructChildFactory { - @Override - public Class getGenericClass() { - return ArrowColumnVector.class; - } - - @Override - public ArrowColumnVector of(ValueVector childVector) { - return new ArrowColumnVector(childVector); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java deleted file mode 100644 index 810fef81b5bb..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more 
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.vectorized; - -import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; -import org.apache.iceberg.arrow.vectorized.VectorHolder; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ArrowColumnVector; -import org.apache.spark.sql.vectorized.ColumnarArray; -import org.apache.spark.unsafe.types.UTF8String; - -public class ArrowVectorAccessors { - - private static final ArrowVectorAccessorFactory factory = new ArrowVectorAccessorFactory(); - - static ArrowVectorAccessor - getVectorAccessor(VectorHolder holder) { - return factory.getVectorAccessor(holder); - } - - private ArrowVectorAccessors() {} -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java deleted file mode 100644 index f761b2eb551b..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.vectorized; - -import java.util.List; -import org.apache.iceberg.arrow.vectorized.BaseBatchReader; -import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader; -import org.apache.iceberg.parquet.VectorizedReader; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.spark.sql.vectorized.ColumnVector; -import org.apache.spark.sql.vectorized.ColumnarBatch; - -/** - * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized - * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors - * populated via delegated read calls to {@linkplain VectorizedArrowReader VectorReader(s)}. 
- */ -public class ColumnarBatchReader extends BaseBatchReader { - - public ColumnarBatchReader(List> readers) { - super(readers); - } - - @Override - public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { - Preconditions.checkArgument( - numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); - ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length]; - - if (reuse == null) { - closeVectors(); - } - - for (int i = 0; i < readers.length; i += 1) { - vectorHolders[i] = readers[i].read(vectorHolders[i], numRowsToRead); - int numRowsInVector = vectorHolders[i].numValues(); - Preconditions.checkState( - numRowsInVector == numRowsToRead, - "Number of rows in the vector %s didn't match expected %s ", - numRowsInVector, - numRowsToRead); - arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector); - } - ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors); - batch.setNumRows(numRowsToRead); - return batch; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java deleted file mode 100644 index 8a0b329ebd52..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.vectorized; - -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.types.Type; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ColumnVector; -import org.apache.spark.sql.vectorized.ColumnarArray; -import org.apache.spark.sql.vectorized.ColumnarMap; -import org.apache.spark.unsafe.types.UTF8String; - -class ConstantColumnVector extends ColumnVector { - - private final Object constant; - private final int batchSize; - - ConstantColumnVector(Type type, int batchSize, Object constant) { - super(SparkSchemaUtil.convert(type)); - this.constant = constant; - this.batchSize = batchSize; - } - - @Override - public void close() {} - - @Override - public boolean hasNull() { - return constant == null; - } - - @Override - public int numNulls() { - return constant == null ? 
batchSize : 0; - } - - @Override - public boolean isNullAt(int rowId) { - return constant == null; - } - - @Override - public boolean getBoolean(int rowId) { - return (boolean) constant; - } - - @Override - public byte getByte(int rowId) { - return (byte) constant; - } - - @Override - public short getShort(int rowId) { - return (short) constant; - } - - @Override - public int getInt(int rowId) { - return (int) constant; - } - - @Override - public long getLong(int rowId) { - return (long) constant; - } - - @Override - public float getFloat(int rowId) { - return (float) constant; - } - - @Override - public double getDouble(int rowId) { - return (double) constant; - } - - @Override - public ColumnarArray getArray(int rowId) { - throw new UnsupportedOperationException("ConstantColumnVector only supports primitives"); - } - - @Override - public ColumnarMap getMap(int ordinal) { - throw new UnsupportedOperationException("ConstantColumnVector only supports primitives"); - } - - @Override - public Decimal getDecimal(int rowId, int precision, int scale) { - return (Decimal) constant; - } - - @Override - public UTF8String getUTF8String(int rowId) { - return (UTF8String) constant; - } - - @Override - public byte[] getBinary(int rowId) { - return (byte[]) constant; - } - - @Override - protected ColumnVector getChild(int ordinal) { - throw new UnsupportedOperationException("ConstantColumnVector only supports primitives"); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java deleted file mode 100644 index 33c1a5284818..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.vectorized; - -import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; -import org.apache.iceberg.arrow.vectorized.NullabilityHolder; -import org.apache.iceberg.arrow.vectorized.VectorHolder; -import org.apache.iceberg.arrow.vectorized.VectorHolder.ConstantVectorHolder; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ArrowColumnVector; -import org.apache.spark.sql.vectorized.ColumnVector; -import org.apache.spark.sql.vectorized.ColumnarArray; -import org.apache.spark.sql.vectorized.ColumnarMap; -import org.apache.spark.unsafe.types.UTF8String; - -/** - * Implementation of Spark's {@link ColumnVector} interface. 
The code for this class is heavily - * inspired from Spark's {@link ArrowColumnVector} The main difference is in how nullability checks - * are made in this class by relying on {@link NullabilityHolder} instead of the validity vector in - * the Arrow vector. - */ -public class IcebergArrowColumnVector extends ColumnVector { - - private final ArrowVectorAccessor accessor; - private final NullabilityHolder nullabilityHolder; - - public IcebergArrowColumnVector(VectorHolder holder) { - super(SparkSchemaUtil.convert(holder.icebergType())); - this.nullabilityHolder = holder.nullabilityHolder(); - this.accessor = ArrowVectorAccessors.getVectorAccessor(holder); - } - - @Override - public void close() { - accessor.close(); - } - - @Override - public boolean hasNull() { - return nullabilityHolder.hasNulls(); - } - - @Override - public int numNulls() { - return nullabilityHolder.numNulls(); - } - - @Override - public boolean isNullAt(int rowId) { - return nullabilityHolder.isNullAt(rowId) == 1; - } - - @Override - public boolean getBoolean(int rowId) { - return accessor.getBoolean(rowId); - } - - @Override - public byte getByte(int rowId) { - throw new UnsupportedOperationException("Unsupported type - byte"); - } - - @Override - public short getShort(int rowId) { - throw new UnsupportedOperationException("Unsupported type - short"); - } - - @Override - public int getInt(int rowId) { - return accessor.getInt(rowId); - } - - @Override - public long getLong(int rowId) { - return accessor.getLong(rowId); - } - - @Override - public float getFloat(int rowId) { - return accessor.getFloat(rowId); - } - - @Override - public double getDouble(int rowId) { - return accessor.getDouble(rowId); - } - - @Override - public ColumnarArray getArray(int rowId) { - if (isNullAt(rowId)) { - return null; - } - return accessor.getArray(rowId); - } - - @Override - public ColumnarMap getMap(int rowId) { - throw new UnsupportedOperationException("Unsupported type - map"); - } - - @Override - public Decimal getDecimal(int rowId, int precision, int scale) { - if (isNullAt(rowId)) { - return null; - } - return accessor.getDecimal(rowId, precision, scale); - } - - @Override - public UTF8String getUTF8String(int rowId) { - if (isNullAt(rowId)) { - return null; - } - return accessor.getUTF8String(rowId); - } - - @Override - public byte[] getBinary(int rowId) { - if (isNullAt(rowId)) { - return null; - } - return accessor.getBinary(rowId); - } - - @Override - public ArrowColumnVector getChild(int ordinal) { - return accessor.childColumn(ordinal); - } - - static ColumnVector forHolder(VectorHolder holder, int numRows) { - return holder.isDummy() - ? new ConstantColumnVector( - Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()) - : new IcebergArrowColumnVector(holder); - } - - public ArrowVectorAccessor - vectorAccessor() { - return accessor; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java deleted file mode 100644 index a4d878b63569..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.vectorized; - -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ColumnVector; -import org.apache.spark.sql.vectorized.ColumnarArray; -import org.apache.spark.sql.vectorized.ColumnarMap; -import org.apache.spark.unsafe.types.UTF8String; - -public class RowPositionColumnVector extends ColumnVector { - - private final long batchOffsetInFile; - - RowPositionColumnVector(long batchOffsetInFile) { - super(SparkSchemaUtil.convert(Types.LongType.get())); - this.batchOffsetInFile = batchOffsetInFile; - } - - @Override - public void close() {} - - @Override - public boolean hasNull() { - return false; - } - - @Override - public int numNulls() { - return 0; - } - - @Override - public boolean isNullAt(int rowId) { - return false; - } - - @Override - public boolean getBoolean(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public byte getByte(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public short getShort(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public int getInt(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public long getLong(int rowId) { - return batchOffsetInFile + rowId; - } - - @Override - public float getFloat(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public double getDouble(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnarArray getArray(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnarMap getMap(int ordinal) { - throw new UnsupportedOperationException(); - } - - @Override - public Decimal getDecimal(int rowId, int precision, int scale) { - throw new UnsupportedOperationException(); - } - - @Override - public UTF8String getUTF8String(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] getBinary(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - protected ColumnVector getChild(int ordinal) { - throw new UnsupportedOperationException(); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java deleted file mode 100644 index 7c3b825a62e7..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ /dev/null @@ -1,459 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
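Editor's note on the RowPositionColumnVector removed just above: it backs the row-position metadata column without materializing any data; every read is derived from the offset of the current batch within the file. A dependency-free sketch of that derivation (PositionColumn is an illustrative name, not the removed class):

public class PositionColumnSketch {
  // Mirrors the removed RowPositionColumnVector: value = batch offset in the file + row id.
  static final class PositionColumn {
    private final long batchOffsetInFile;

    PositionColumn(long batchOffsetInFile) {
      this.batchOffsetInFile = batchOffsetInFile;
    }

    long getLong(int rowId) {
      return batchOffsetInFile + rowId;
    }
  }

  public static void main(String[] args) {
    PositionColumn positions = new PositionColumn(1_000L);
    System.out.println(positions.getLong(0)); // 1000
    System.out.println(positions.getLong(5)); // 1005
  }
}

The VectorizedSparkOrcReaders file whose removal follows wires this vector in whenever a requested struct field is the reserved row-position metadata column.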
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.vectorized; - -import java.util.List; -import java.util.Map; -import java.util.stream.IntStream; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.orc.OrcBatchReader; -import org.apache.iceberg.orc.OrcSchemaWithTypeVisitor; -import org.apache.iceberg.orc.OrcValueReader; -import org.apache.iceberg.orc.OrcValueReaders; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.SparkOrcValueReaders; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.ql.exec.vector.ListColumnVector; -import org.apache.orc.storage.ql.exec.vector.MapColumnVector; -import org.apache.orc.storage.ql.exec.vector.StructColumnVector; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.vectorized.ColumnVector; -import org.apache.spark.sql.vectorized.ColumnarArray; -import org.apache.spark.sql.vectorized.ColumnarBatch; -import org.apache.spark.sql.vectorized.ColumnarMap; -import org.apache.spark.unsafe.types.UTF8String; - -public class VectorizedSparkOrcReaders { - - private VectorizedSparkOrcReaders() {} - - public static OrcBatchReader buildReader( - Schema expectedSchema, TypeDescription fileSchema, Map idToConstant) { - Converter converter = - OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); - - return new OrcBatchReader() { - private long batchOffsetInFile; - - @Override - public ColumnarBatch read(VectorizedRowBatch batch) { - BaseOrcColumnVector cv = - (BaseOrcColumnVector) - converter.convert( - new StructColumnVector(batch.size, batch.cols), batch.size, batchOffsetInFile); - ColumnarBatch columnarBatch = - new ColumnarBatch( - IntStream.range(0, expectedSchema.columns().size()) - .mapToObj(cv::getChild) - .toArray(ColumnVector[]::new)); - columnarBatch.setNumRows(batch.size); - return columnarBatch; - } - - @Override - public void setBatchContext(long batchOffsetInFile) { - this.batchOffsetInFile = batchOffsetInFile; - } - }; - } - - private interface Converter { - ColumnVector convert( - org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, - int batchSize, - long batchOffsetInFile); - } - - private static class ReadBuilder extends OrcSchemaWithTypeVisitor { - private final Map idToConstant; - - private ReadBuilder(Map idToConstant) { - this.idToConstant = idToConstant; - } - - @Override - public Converter record( - Types.StructType iStruct, - TypeDescription record, - List names, - List fields) { - return new StructConverter(iStruct, fields, idToConstant); - } - - @Override - public Converter list(Types.ListType iList, TypeDescription array, Converter element) { - return new ArrayConverter(iList, 
element); - } - - @Override - public Converter map(Types.MapType iMap, TypeDescription map, Converter key, Converter value) { - return new MapConverter(iMap, key, value); - } - - @Override - public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) { - final OrcValueReader primitiveValueReader; - switch (primitive.getCategory()) { - case BOOLEAN: - primitiveValueReader = OrcValueReaders.booleans(); - break; - case BYTE: - // Iceberg does not have a byte type. Use int - case SHORT: - // Iceberg does not have a short type. Use int - case DATE: - case INT: - primitiveValueReader = OrcValueReaders.ints(); - break; - case LONG: - primitiveValueReader = OrcValueReaders.longs(); - break; - case FLOAT: - primitiveValueReader = OrcValueReaders.floats(); - break; - case DOUBLE: - primitiveValueReader = OrcValueReaders.doubles(); - break; - case TIMESTAMP_INSTANT: - case TIMESTAMP: - primitiveValueReader = SparkOrcValueReaders.timestampTzs(); - break; - case DECIMAL: - primitiveValueReader = - SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); - break; - case CHAR: - case VARCHAR: - case STRING: - primitiveValueReader = SparkOrcValueReaders.utf8String(); - break; - case BINARY: - primitiveValueReader = OrcValueReaders.bytes(); - break; - default: - throw new IllegalArgumentException("Unhandled type " + primitive); - } - return (columnVector, batchSize, batchOffsetInFile) -> - new PrimitiveOrcColumnVector( - iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); - } - } - - private abstract static class BaseOrcColumnVector extends ColumnVector { - private final org.apache.orc.storage.ql.exec.vector.ColumnVector vector; - private final int batchSize; - private Integer numNulls; - - BaseOrcColumnVector( - Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { - super(SparkSchemaUtil.convert(type)); - this.vector = vector; - this.batchSize = batchSize; - } - - @Override - public void close() {} - - @Override - public boolean hasNull() { - return !vector.noNulls; - } - - @Override - public int numNulls() { - if (numNulls == null) { - numNulls = numNullsHelper(); - } - return numNulls; - } - - private int numNullsHelper() { - if (vector.isRepeating) { - if (vector.isNull[0]) { - return batchSize; - } else { - return 0; - } - } else if (vector.noNulls) { - return 0; - } else { - int count = 0; - for (int i = 0; i < batchSize; i++) { - if (vector.isNull[i]) { - count++; - } - } - return count; - } - } - - protected int getRowIndex(int rowId) { - return vector.isRepeating ? 
0 : rowId; - } - - @Override - public boolean isNullAt(int rowId) { - return vector.isNull[getRowIndex(rowId)]; - } - - @Override - public boolean getBoolean(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public byte getByte(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public short getShort(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public int getInt(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public long getLong(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public float getFloat(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public double getDouble(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public Decimal getDecimal(int rowId, int precision, int scale) { - throw new UnsupportedOperationException(); - } - - @Override - public UTF8String getUTF8String(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] getBinary(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnarArray getArray(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnarMap getMap(int rowId) { - throw new UnsupportedOperationException(); - } - - @Override - public ColumnVector getChild(int ordinal) { - throw new UnsupportedOperationException(); - } - } - - private static class PrimitiveOrcColumnVector extends BaseOrcColumnVector { - private final org.apache.orc.storage.ql.exec.vector.ColumnVector vector; - private final OrcValueReader primitiveValueReader; - private final long batchOffsetInFile; - - PrimitiveOrcColumnVector( - Type type, - int batchSize, - org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - OrcValueReader primitiveValueReader, - long batchOffsetInFile) { - super(type, batchSize, vector); - this.vector = vector; - this.primitiveValueReader = primitiveValueReader; - this.batchOffsetInFile = batchOffsetInFile; - } - - @Override - public boolean getBoolean(int rowId) { - return (Boolean) primitiveValueReader.read(vector, rowId); - } - - @Override - public int getInt(int rowId) { - return (Integer) primitiveValueReader.read(vector, rowId); - } - - @Override - public long getLong(int rowId) { - return (Long) primitiveValueReader.read(vector, rowId); - } - - @Override - public float getFloat(int rowId) { - return (Float) primitiveValueReader.read(vector, rowId); - } - - @Override - public double getDouble(int rowId) { - return (Double) primitiveValueReader.read(vector, rowId); - } - - @Override - public Decimal getDecimal(int rowId, int precision, int scale) { - // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the - // decimal type - // and return a Decimal with (precision,scale) of the decimal type? 
- return (Decimal) primitiveValueReader.read(vector, rowId); - } - - @Override - public UTF8String getUTF8String(int rowId) { - return (UTF8String) primitiveValueReader.read(vector, rowId); - } - - @Override - public byte[] getBinary(int rowId) { - return (byte[]) primitiveValueReader.read(vector, rowId); - } - } - - private static class ArrayConverter implements Converter { - private final Types.ListType listType; - private final Converter elementConverter; - - private ArrayConverter(Types.ListType listType, Converter elementConverter) { - this.listType = listType; - this.elementConverter = elementConverter; - } - - @Override - public ColumnVector convert( - org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - int batchSize, - long batchOffsetInFile) { - ListColumnVector listVector = (ListColumnVector) vector; - ColumnVector elementVector = - elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); - - return new BaseOrcColumnVector(listType, batchSize, vector) { - @Override - public ColumnarArray getArray(int rowId) { - int index = getRowIndex(rowId); - return new ColumnarArray( - elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); - } - }; - } - } - - private static class MapConverter implements Converter { - private final Types.MapType mapType; - private final Converter keyConverter; - private final Converter valueConverter; - - private MapConverter(Types.MapType mapType, Converter keyConverter, Converter valueConverter) { - this.mapType = mapType; - this.keyConverter = keyConverter; - this.valueConverter = valueConverter; - } - - @Override - public ColumnVector convert( - org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - int batchSize, - long batchOffsetInFile) { - MapColumnVector mapVector = (MapColumnVector) vector; - ColumnVector keyVector = keyConverter.convert(mapVector.keys, batchSize, batchOffsetInFile); - ColumnVector valueVector = - valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); - - return new BaseOrcColumnVector(mapType, batchSize, vector) { - @Override - public ColumnarMap getMap(int rowId) { - int index = getRowIndex(rowId); - return new ColumnarMap( - keyVector, - valueVector, - (int) mapVector.offsets[index], - (int) mapVector.lengths[index]); - } - }; - } - } - - private static class StructConverter implements Converter { - private final Types.StructType structType; - private final List fieldConverters; - private final Map idToConstant; - - private StructConverter( - Types.StructType structType, - List fieldConverters, - Map idToConstant) { - this.structType = structType; - this.fieldConverters = fieldConverters; - this.idToConstant = idToConstant; - } - - @Override - public ColumnVector convert( - org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - int batchSize, - long batchOffsetInFile) { - StructColumnVector structVector = (StructColumnVector) vector; - List fields = structType.fields(); - List fieldVectors = Lists.newArrayListWithExpectedSize(fields.size()); - for (int pos = 0, vectorIndex = 0; pos < fields.size(); pos += 1) { - Types.NestedField field = fields.get(pos); - if (idToConstant.containsKey(field.fieldId())) { - fieldVectors.add( - new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); - } else if (field.equals(MetadataColumns.ROW_POSITION)) { - fieldVectors.add(new RowPositionColumnVector(batchOffsetInFile)); - } else if (field.equals(MetadataColumns.IS_DELETED)) { - fieldVectors.add(new ConstantColumnVector(field.type(), 
batchSize, false)); - } else { - fieldVectors.add( - fieldConverters - .get(vectorIndex) - .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); - vectorIndex++; - } - } - - return new BaseOrcColumnVector(structType, batchSize, vector) { - @Override - public ColumnVector getChild(int ordinal) { - return fieldVectors.get(ordinal); - } - }; - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java deleted file mode 100644 index bbb63e077bc6..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.vectorized; - -import java.util.Map; -import org.apache.iceberg.Schema; -import org.apache.iceberg.arrow.vectorized.VectorizedReaderBuilder; -import org.apache.iceberg.parquet.TypeWithSchemaVisitor; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.parquet.schema.MessageType; - -public class VectorizedSparkParquetReaders { - - private VectorizedSparkParquetReaders() {} - - public static ColumnarBatchReader buildReader( - Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { - return buildReader(expectedSchema, fileSchema, setArrowValidityVector, Maps.newHashMap()); - } - - public static ColumnarBatchReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector, - Map idToConstant) { - return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit( - expectedSchema.asStruct(), - fileSchema, - new VectorizedReaderBuilder( - expectedSchema, - fileSchema, - setArrowValidityVector, - idToConstant, - ColumnarBatchReader::new)); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java deleted file mode 100644 index 2cab8ee238e0..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.Closeable; -import java.io.IOException; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Stream; -import org.apache.avro.generic.GenericData; -import org.apache.avro.util.Utf8; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Partitioning; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.encryption.EncryptedFiles; -import org.apache.iceberg.encryption.EncryptedInputFile; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types.NestedField; -import org.apache.iceberg.types.Types.StructType; -import org.apache.iceberg.util.ByteBuffers; -import org.apache.iceberg.util.PartitionUtil; -import org.apache.spark.rdd.InputFileBlockHolder; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.UTF8String; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Base class of Spark readers. - * - * @param is the Java class returned by this reader whose objects contain one or more rows. 
- */ -abstract class BaseDataReader implements Closeable { - private static final Logger LOG = LoggerFactory.getLogger(BaseDataReader.class); - - private final Table table; - private final Iterator tasks; - private final Map inputFiles; - - private CloseableIterator currentIterator; - private T current = null; - private FileScanTask currentTask = null; - - BaseDataReader(Table table, CombinedScanTask task) { - this.table = table; - this.tasks = task.files().iterator(); - Map keyMetadata = Maps.newHashMap(); - task.files().stream() - .flatMap( - fileScanTask -> - Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) - .forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata())); - Stream encrypted = - keyMetadata.entrySet().stream() - .map( - entry -> - EncryptedFiles.encryptedInput( - table.io().newInputFile(entry.getKey()), entry.getValue())); - - // decrypt with the batch call to avoid multiple RPCs to a key server, if possible - Iterable decryptedFiles = table.encryption().decrypt(encrypted::iterator); - - Map files = Maps.newHashMapWithExpectedSize(task.files().size()); - decryptedFiles.forEach(decrypted -> files.putIfAbsent(decrypted.location(), decrypted)); - this.inputFiles = ImmutableMap.copyOf(files); - - this.currentIterator = CloseableIterator.empty(); - } - - public boolean next() throws IOException { - try { - while (true) { - if (currentIterator.hasNext()) { - this.current = currentIterator.next(); - return true; - } else if (tasks.hasNext()) { - this.currentIterator.close(); - this.currentTask = tasks.next(); - this.currentIterator = open(currentTask); - } else { - this.currentIterator.close(); - return false; - } - } - } catch (IOException | RuntimeException e) { - if (currentTask != null && !currentTask.isDataTask()) { - LOG.error("Error reading file: {}", getInputFile(currentTask).location(), e); - } - throw e; - } - } - - public T get() { - return current; - } - - abstract CloseableIterator open(FileScanTask task); - - @Override - public void close() throws IOException { - InputFileBlockHolder.unset(); - - // close the current iterator - this.currentIterator.close(); - - // exhaust the task iterator - while (tasks.hasNext()) { - tasks.next(); - } - } - - protected InputFile getInputFile(FileScanTask task) { - Preconditions.checkArgument(!task.isDataTask(), "Invalid task type"); - return inputFiles.get(task.file().path().toString()); - } - - protected InputFile getInputFile(String location) { - return inputFiles.get(location); - } - - protected Map constantsMap(FileScanTask task, Schema readSchema) { - if (readSchema.findField(MetadataColumns.PARTITION_COLUMN_ID) != null) { - StructType partitionType = Partitioning.partitionType(table); - return PartitionUtil.constantsMap(task, partitionType, BaseDataReader::convertConstant); - } else { - return PartitionUtil.constantsMap(task, BaseDataReader::convertConstant); - } - } - - protected static Object convertConstant(Type type, Object value) { - if (value == null) { - return null; - } - - switch (type.typeId()) { - case DECIMAL: - return Decimal.apply((BigDecimal) value); - case STRING: - if (value instanceof Utf8) { - Utf8 utf8 = (Utf8) value; - return UTF8String.fromBytes(utf8.getBytes(), 0, utf8.getByteLength()); - } - return UTF8String.fromString(value.toString()); - case FIXED: - if (value instanceof byte[]) { - return value; - } else if (value instanceof GenericData.Fixed) { - return ((GenericData.Fixed) value).bytes(); - } - return ByteBuffers.toByteArray((ByteBuffer) value); - 
case BINARY: - return ByteBuffers.toByteArray((ByteBuffer) value); - case STRUCT: - StructType structType = (StructType) type; - - if (structType.fields().isEmpty()) { - return new GenericInternalRow(); - } - - List fields = structType.fields(); - Object[] values = new Object[fields.size()]; - StructLike struct = (StructLike) value; - - for (int index = 0; index < fields.size(); index++) { - NestedField field = fields.get(index); - Type fieldType = field.type(); - values[index] = - convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); - } - - return new GenericInternalRow(values); - default: - } - return value; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java deleted file mode 100644 index d620faa979f6..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
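convertConstant above is what puts Iceberg partition and metadata constants into the representations Spark's InternalRow machinery expects. The sketch below is illustrative only (the class name and literal values are invented); it exercises the same Spark calls used by the STRING and DECIMAL cases:

    import java.math.BigDecimal;
    import org.apache.avro.util.Utf8;
    import org.apache.spark.sql.types.Decimal;
    import org.apache.spark.unsafe.types.UTF8String;

    public class ConstantConversionSketch {
      public static void main(String[] args) {
        // STRING constants may arrive as Avro Utf8 or plain String; either way they end up as UTF8String.
        Utf8 avroString = new Utf8("books");
        UTF8String sparkString =
            UTF8String.fromBytes(avroString.getBytes(), 0, avroString.getByteLength());

        // DECIMAL constants are wrapped with Decimal.apply, as in the DECIMAL case above.
        Decimal sparkDecimal = Decimal.apply(new BigDecimal("19.99"));

        System.out.println(sparkString + " / " + sparkDecimal);
      }
    }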
- */ -package org.apache.iceberg.spark.source; - -import java.util.Map; -import java.util.Set; -import org.apache.arrow.vector.NullCheckingForGet; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; -import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; -import org.apache.iceberg.types.TypeUtil; -import org.apache.spark.rdd.InputFileBlockHolder; -import org.apache.spark.sql.vectorized.ColumnarBatch; - -class BatchDataReader extends BaseDataReader { - private final Schema expectedSchema; - private final String nameMapping; - private final boolean caseSensitive; - private final int batchSize; - - BatchDataReader( - CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { - super(table, task); - this.expectedSchema = expectedSchema; - this.nameMapping = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - this.caseSensitive = caseSensitive; - this.batchSize = size; - } - - @Override - CloseableIterator open(FileScanTask task) { - DataFile file = task.file(); - - // update the current file for Spark's filename() function - InputFileBlockHolder.set(file.path().toString(), task.start(), task.length()); - - Map idToConstant = constantsMap(task, expectedSchema); - - CloseableIterable iter; - InputFile location = getInputFile(task); - Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); - if (task.file().format() == FileFormat.PARQUET) { - Parquet.ReadBuilder builder = - Parquet.read(location) - .project(expectedSchema) - .split(task.start(), task.length()) - .createBatchedReaderFunc( - fileSchema -> - VectorizedSparkParquetReaders.buildReader( - expectedSchema, - fileSchema, /* setArrowValidityVector */ - NullCheckingForGet.NULL_CHECKING_ENABLED, - idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive) - // Spark eagerly consumes the batches. So the underlying memory allocated could be - // reused - // without worrying about subsequent reads clobbering over each other. This improves - // read performance as every batch read doesn't have to pay the cost of allocating - // memory. 
- .reuseContainers(); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - iter = builder.build(); - } else if (task.file().format() == FileFormat.ORC) { - Set constantFieldIds = idToConstant.keySet(); - Set metadataFieldIds = MetadataColumns.metadataFieldIds(); - Sets.SetView constantAndMetadataFieldIds = - Sets.union(constantFieldIds, metadataFieldIds); - Schema schemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds); - ORC.ReadBuilder builder = - ORC.read(location) - .project(schemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createBatchedReaderFunc( - fileSchema -> - VectorizedSparkOrcReaders.buildReader( - expectedSchema, fileSchema, idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - iter = builder.build(); - } else { - throw new UnsupportedOperationException( - "Format: " + task.file().format() + " not supported for batched reads"); - } - return iter.iterator(); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java deleted file mode 100644 index ecaf716f6a36..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
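From the driver's point of view, BatchDataReader is consumed through the next()/get() contract defined in BaseDataReader. The fragment below is a sketch under stated assumptions (same-package access, an already-planned task, and an invented batch size of 4096), not code from this patch:

    // Assumptions: 'task', 'table' and 'expectedSchema' are in scope, the caller lives in
    // org.apache.iceberg.spark.source, and the enclosing method declares 'throws IOException'.
    BatchDataReader reader =
        new BatchDataReader(task, table, expectedSchema, true /* caseSensitive */, 4096);
    try {
      while (reader.next()) {
        ColumnarBatch batch = reader.get();
        java.util.Iterator<InternalRow> rows = batch.rowIterator();
        while (rows.hasNext()) {
          InternalRow row = rows.next(); // columnar data exposed row by row to Spark operators
        }
      }
    } finally {
      reader.close();
    }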
- */ -package org.apache.iceberg.spark.source; - -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.EnvironmentContext; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Splitter; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.util.Pair; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SparkSession; - -public final class CustomCatalogs { - private static final Cache, Catalog> CATALOG_CACHE = - Caffeine.newBuilder().build(); - - public static final String ICEBERG_DEFAULT_CATALOG = "default_catalog"; - public static final String ICEBERG_CATALOG_PREFIX = "spark.sql.catalog"; - - private CustomCatalogs() {} - - /** - * Build an Iceberg {@link Catalog} to be used by this Spark source adapter. - * - *
<p>
The cache is to facilitate reuse of catalogs, especially if wrapped in CachingCatalog. For - * non-Hive catalogs all custom parameters passed to the catalog are considered in the cache key. - * Hive catalogs only cache based on the Metastore URIs as per previous behaviour. - * - * @param spark Spark Session - * @param name Catalog Name - * @return an Iceberg catalog - */ - public static Catalog loadCatalog(SparkSession spark, String name) { - return CATALOG_CACHE.get(Pair.of(spark, name), CustomCatalogs::buildCatalog); - } - - private static Catalog buildCatalog(Pair sparkAndName) { - SparkSession spark = sparkAndName.first(); - String name = sparkAndName.second(); - SparkConf sparkConf = spark.sparkContext().getConf(); - Configuration conf = SparkUtil.hadoopConfCatalogOverrides(spark, name); - - String catalogPrefix = String.format("%s.%s", ICEBERG_CATALOG_PREFIX, name); - if (!name.equals(ICEBERG_DEFAULT_CATALOG) && !sparkConf.contains(catalogPrefix)) { - // we return null if spark.sql.catalog. is not the Spark Catalog - // and we aren't looking for the default catalog - return null; - } - - Map options = - Arrays.stream(sparkConf.getAllWithPrefix(catalogPrefix + ".")) - .collect(Collectors.toMap(x -> x._1, x -> x._2)); - - EnvironmentContext.put(EnvironmentContext.ENGINE_NAME, "spark"); - EnvironmentContext.put(EnvironmentContext.ENGINE_VERSION, spark.sparkContext().version()); - EnvironmentContext.put(CatalogProperties.APP_ID, spark.sparkContext().applicationId()); - - return CatalogUtil.buildIcebergCatalog(name, options, conf); - } - - public static Table table(SparkSession spark, String path) { - Pair catalogAndTableIdentifier = catalogAndIdentifier(spark, path); - return catalogAndTableIdentifier.first().loadTable(catalogAndTableIdentifier.second()); - } - - private static Pair catalogAndIdentifier( - SparkSession spark, String path) { - String[] currentNamespace = new String[] {spark.catalog().currentDatabase()}; - List nameParts = Splitter.on(".").splitToList(path); - return SparkUtil.catalogAndIdentifier( - nameParts, - s -> loadCatalog(spark, s), - (n, t) -> TableIdentifier.of(Namespace.of(n), t), - loadCatalog(spark, ICEBERG_DEFAULT_CATALOG), - currentNamespace); - } - - @VisibleForTesting - static void clearCache() { - CATALOG_CACHE.invalidateAll(); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java deleted file mode 100644 index 1c55e1b8ebe2..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
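The catalog wiring above is driven entirely by Spark conf keys under spark.sql.catalog.<name>, with default_catalog used when the first part of a dotted path is not a configured catalog. A minimal, hypothetical setup (the catalog type, warehouse path and table name are invented, and the table must already exist for the read to succeed):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class CustomCatalogExample {
      public static void main(String[] args) {
        SparkSession spark =
            SparkSession.builder()
                .master("local[2]")
                // Options under spark.sql.catalog.default_catalog.* are handed to
                // CatalogUtil.buildIcebergCatalog by buildCatalog above.
                .config("spark.sql.catalog.default_catalog.type", "hadoop")
                .config("spark.sql.catalog.default_catalog.warehouse", "file:///tmp/iceberg-warehouse")
                .getOrCreate();

        // "db.events" contains no '/', so the source resolves it through CustomCatalogs.table:
        // "db" is not a configured catalog, so it falls back to default_catalog and identifier db.events.
        Dataset<Row> df = spark.read().format("iceberg").load("db.events");
        df.show();
      }
    }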
- */ -package org.apache.iceberg.spark.source; - -import java.util.Map; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.spark.rdd.InputFileBlockHolder; -import org.apache.spark.sql.catalyst.InternalRow; - -public class EqualityDeleteRowReader extends RowDataReader { - private final Schema expectedSchema; - - public EqualityDeleteRowReader( - CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { - super(task, table, table.schema(), caseSensitive); - this.expectedSchema = expectedSchema; - } - - @Override - CloseableIterator open(FileScanTask task) { - SparkDeleteFilter matches = new SparkDeleteFilter(task, tableSchema(), expectedSchema); - - // schema or rows returned by readers - Schema requiredSchema = matches.requiredSchema(); - Map idToConstant = constantsMap(task, expectedSchema); - DataFile file = task.file(); - - // update the current file for Spark's filename() function - InputFileBlockHolder.set(file.path().toString(), task.start(), task.length()); - - return matches.findEqualityDeleteRows(open(task, requiredSchema, idToConstant)).iterator(); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java deleted file mode 100644 index f3ceee176a94..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
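EqualityDeleteRowReader inverts the normal read path: findEqualityDeleteRows keeps only the rows that equality delete files mark as deleted. A same-package consumption sketch (the task, table and projection schema are assumed to exist, and the enclosing method must declare IOException):

    // Assumptions: 'task', 'table' and 'projectionSchema' are in scope and the caller is in this package.
    EqualityDeleteRowReader deletedRows =
        new EqualityDeleteRowReader(task, table, projectionSchema, true /* caseSensitive */);
    try {
      while (deletedRows.next()) {
        InternalRow row = deletedRows.get(); // a row matched by an equality delete
        // count or inspect the deleted row here
      }
    } finally {
      deletedRows.close();
    }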
- */ -package org.apache.iceberg.spark.source; - -import java.util.Map; -import java.util.Optional; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.spark.SparkWriteConf; -import org.apache.iceberg.types.TypeUtil; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.execution.streaming.StreamExecution; -import org.apache.spark.sql.sources.DataSourceRegister; -import org.apache.spark.sql.sources.v2.DataSourceOptions; -import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.ReadSupport; -import org.apache.spark.sql.sources.v2.StreamWriteSupport; -import org.apache.spark.sql.sources.v2.WriteSupport; -import org.apache.spark.sql.sources.v2.reader.DataSourceReader; -import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; -import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter; -import org.apache.spark.sql.streaming.OutputMode; -import org.apache.spark.sql.types.StructType; - -public class IcebergSource - implements DataSourceV2, ReadSupport, WriteSupport, DataSourceRegister, StreamWriteSupport { - - private SparkSession lazySpark = null; - private Configuration lazyConf = null; - - @Override - public String shortName() { - return "iceberg"; - } - - @Override - public DataSourceReader createReader(DataSourceOptions options) { - return createReader(null, options); - } - - @Override - public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) { - Configuration conf = new Configuration(lazyBaseConf()); - Table table = getTableAndResolveHadoopConfiguration(options, conf); - String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive"); - - Reader reader = - new Reader(lazySparkSession(), table, Boolean.parseBoolean(caseSensitive), options); - if (readSchema != null) { - // convert() will fail if readSchema contains fields not in reader.snapshotSchema() - SparkSchemaUtil.convert(reader.snapshotSchema(), readSchema); - reader.pruneColumns(readSchema); - } - - return reader; - } - - @Override - public Optional createWriter( - String jobId, StructType dsStruct, SaveMode mode, DataSourceOptions options) { - Preconditions.checkArgument( - mode == SaveMode.Append || mode == SaveMode.Overwrite, - "Save mode %s is not supported", - mode); - Configuration conf = new Configuration(lazyBaseConf()); - Table table = getTableAndResolveHadoopConfiguration(options, conf); - SparkWriteConf writeConf = new SparkWriteConf(lazySparkSession(), table, options.asMap()); - - Preconditions.checkArgument( - writeConf.handleTimestampWithoutZone() - || !SparkUtil.hasTimestampWithoutZone(table.schema()), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); - - Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct); - TypeUtil.validateWriteSchema( - table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); - SparkUtil.validatePartitionTransforms(table.spec()); - String appId = lazySparkSession().sparkContext().applicationId(); - String wapId = writeConf.wapId(); - boolean replacePartitions = mode == SaveMode.Overwrite; - - return Optional.of( - new Writer( - lazySparkSession(), - table, - writeConf, - replacePartitions, - 
appId, - wapId, - writeSchema, - dsStruct)); - } - - @Override - public StreamWriter createStreamWriter( - String runId, StructType dsStruct, OutputMode mode, DataSourceOptions options) { - Preconditions.checkArgument( - mode == OutputMode.Append() || mode == OutputMode.Complete(), - "Output mode %s is not supported", - mode); - Configuration conf = new Configuration(lazyBaseConf()); - Table table = getTableAndResolveHadoopConfiguration(options, conf); - SparkWriteConf writeConf = new SparkWriteConf(lazySparkSession(), table, options.asMap()); - - Preconditions.checkArgument( - writeConf.handleTimestampWithoutZone() - || !SparkUtil.hasTimestampWithoutZone(table.schema()), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); - - Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct); - TypeUtil.validateWriteSchema( - table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); - SparkUtil.validatePartitionTransforms(table.spec()); - // Spark 2.4.x passes runId to createStreamWriter instead of real queryId, - // so we fetch it directly from sparkContext to make writes idempotent - String queryId = - lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY()); - String appId = lazySparkSession().sparkContext().applicationId(); - - return new StreamingWriter( - lazySparkSession(), table, writeConf, queryId, mode, appId, writeSchema, dsStruct); - } - - protected Table findTable(DataSourceOptions options, Configuration conf) { - Optional path = options.get("path"); - Preconditions.checkArgument(path.isPresent(), "Cannot open table: path is not set"); - - if (path.get().contains("/")) { - HadoopTables tables = new HadoopTables(conf); - return tables.load(path.get()); - } else { - return CustomCatalogs.table(lazySparkSession(), path.get()); - } - } - - private SparkSession lazySparkSession() { - if (lazySpark == null) { - this.lazySpark = SparkSession.builder().getOrCreate(); - } - return lazySpark; - } - - private Configuration lazyBaseConf() { - if (lazyConf == null) { - this.lazyConf = lazySparkSession().sessionState().newHadoopConf(); - } - return lazyConf; - } - - private Table getTableAndResolveHadoopConfiguration( - DataSourceOptions options, Configuration conf) { - // Overwrite configurations from the Spark Context with configurations from the options. - mergeIcebergHadoopConfs(conf, options.asMap()); - Table table = findTable(options, conf); - // Set confs from table properties - mergeIcebergHadoopConfs(conf, table.properties()); - // Re-overwrite values set in options and table properties but were not in the environment. - mergeIcebergHadoopConfs(conf, options.asMap()); - return table; - } - - private static void mergeIcebergHadoopConfs(Configuration baseConf, Map options) { - options.keySet().stream() - .filter(key -> key.startsWith("hadoop.")) - .forEach(key -> baseConf.set(key.replaceFirst("hadoop.", ""), options.get(key))); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java deleted file mode 100644 index 524266f6f83a..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
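For context, this class is what spark.read.format("iceberg") and df.write.format("iceberg") resolve to on Spark 2.4. The usage sketch below is illustrative: the warehouse path and snapshot ID are invented, and the option names follow the standard Iceberg read options ("snapshot-id", "as-of-timestamp") defined in SparkReadOptions rather than in this file:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class IcebergSourceUsage {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[2]").getOrCreate();
        // A path containing '/' is loaded through HadoopTables in findTable above.
        String tableLocation = "file:///tmp/iceberg-warehouse/db/events";

        // Read the current snapshot.
        Dataset<Row> current = spark.read().format("iceberg").load(tableLocation);

        // Time travel to a specific snapshot (the ID is made up).
        Dataset<Row> historical =
            spark.read().format("iceberg").option("snapshot-id", "4828371033540983891").load(tableLocation);

        // createWriter above only accepts Append and Overwrite.
        current.write().format("iceberg").mode(SaveMode.Append).save(tableLocation);

        System.out.println(historical.count());
      }
    }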
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.nio.ByteBuffer; -import java.util.function.BiFunction; -import java.util.stream.Stream; -import org.apache.iceberg.StructLike; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.BinaryType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DecimalType; -import org.apache.spark.sql.types.StringType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -/** - * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like {@link - * org.apache.iceberg.PartitionKey#partition(StructLike)} - */ -class InternalRowWrapper implements StructLike { - private final DataType[] types; - private final BiFunction[] getters; - private InternalRow row = null; - - @SuppressWarnings("unchecked") - InternalRowWrapper(StructType rowType) { - this.types = Stream.of(rowType.fields()).map(StructField::dataType).toArray(DataType[]::new); - this.getters = Stream.of(types).map(InternalRowWrapper::getter).toArray(BiFunction[]::new); - } - - InternalRowWrapper wrap(InternalRow internalRow) { - this.row = internalRow; - return this; - } - - @Override - public int size() { - return types.length; - } - - @Override - public T get(int pos, Class javaClass) { - if (row.isNullAt(pos)) { - return null; - } else if (getters[pos] != null) { - return javaClass.cast(getters[pos].apply(row, pos)); - } - - return javaClass.cast(row.get(pos, types[pos])); - } - - @Override - public void set(int pos, T value) { - row.update(pos, value); - } - - private static BiFunction getter(DataType type) { - if (type instanceof StringType) { - return (row, pos) -> row.getUTF8String(pos).toString(); - } else if (type instanceof DecimalType) { - DecimalType decimal = (DecimalType) type; - return (row, pos) -> - row.getDecimal(pos, decimal.precision(), decimal.scale()).toJavaBigDecimal(); - } else if (type instanceof BinaryType) { - return (row, pos) -> ByteBuffer.wrap(row.getBinary(pos)); - } else if (type instanceof StructType) { - StructType structType = (StructType) type; - InternalRowWrapper nestedWrapper = new InternalRowWrapper(structType); - return (row, pos) -> nestedWrapper.wrap(row.getStruct(pos, structType.size())); - } - - return null; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java deleted file mode 100644 index 94396d218304..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
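The wrapper above exists so that a Spark InternalRow can be handed to Iceberg code that expects StructLike, most notably PartitionKey.partition. A same-package sketch (the 'table' and the 'rows' iterable are assumptions and must match the table schema):

    // Assumptions: executed from org.apache.iceberg.spark.source, with an Iceberg Table 'table'
    // and an Iterable<InternalRow> 'rows' in scope.
    StructType sparkType = SparkSchemaUtil.convert(table.schema());
    InternalRowWrapper wrapper = new InternalRowWrapper(sparkType);
    PartitionKey partitionKey = new PartitionKey(table.spec(), table.schema());

    for (InternalRow row : rows) {
      partitionKey.partition(wrapper.wrap(row)); // one wrapper instance is reused across rows
      // partitionKey now holds this row's partition tuple, e.g. for routing to a partitioned writer
    }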
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.SnapshotSummary; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TableScan; -import org.apache.iceberg.exceptions.RuntimeIOException; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.hadoop.Util; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkFilters; -import org.apache.iceberg.spark.SparkReadConf; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.TableScanUtil; -import org.apache.iceberg.util.Tasks; -import org.apache.iceberg.util.ThreadPools; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.sources.Filter; -import org.apache.spark.sql.sources.v2.DataSourceOptions; -import org.apache.spark.sql.sources.v2.reader.DataSourceReader; -import org.apache.spark.sql.sources.v2.reader.InputPartition; -import org.apache.spark.sql.sources.v2.reader.InputPartitionReader; -import org.apache.spark.sql.sources.v2.reader.Statistics; -import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters; -import org.apache.spark.sql.sources.v2.reader.SupportsPushDownRequiredColumns; -import org.apache.spark.sql.sources.v2.reader.SupportsReportStatistics; -import org.apache.spark.sql.sources.v2.reader.SupportsScanColumnarBatch; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.vectorized.ColumnarBatch; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -class Reader - implements DataSourceReader, - SupportsScanColumnarBatch, - SupportsPushDownFilters, - SupportsPushDownRequiredColumns, - SupportsReportStatistics { - private static final Logger LOG = 
LoggerFactory.getLogger(Reader.class); - - private static final Filter[] NO_FILTERS = new Filter[0]; - private static final ImmutableSet LOCALITY_WHITELIST_FS = ImmutableSet.of("hdfs"); - - private final JavaSparkContext sparkContext; - private final Table table; - private final SparkReadConf readConf; - private final TableScan baseScan; - private StructType requestedSchema = null; - private List filterExpressions = null; - private Filter[] pushedFilters = NO_FILTERS; - private final boolean localityPreferred; - private final boolean readTimestampWithoutZone; - - // lazy variables - private Schema schema = null; - private StructType type = null; // cached because Spark accesses it multiple times - private List tasks = null; // lazy cache of tasks - private Boolean readUsingBatch = null; - private int batchSize = 0; - - Reader(SparkSession spark, Table table, boolean caseSensitive, DataSourceOptions options) { - this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - this.table = table; - this.readConf = new SparkReadConf(spark, table, options.asMap()); - - this.baseScan = configureBaseScan(caseSensitive, options); - this.schema = baseScan.schema(); - - if (table.io() instanceof HadoopFileIO) { - String fsscheme = "no_exist"; - try { - Configuration conf = SparkSession.active().sessionState().newHadoopConf(); - // merge hadoop config set on table - mergeIcebergHadoopConfs(conf, table.properties()); - // merge hadoop config passed as options and overwrite the one on table - mergeIcebergHadoopConfs(conf, options.asMap()); - FileSystem fs = new Path(table.location()).getFileSystem(conf); - fsscheme = fs.getScheme().toLowerCase(Locale.ENGLISH); - } catch (IOException ioe) { - LOG.warn("Failed to get Hadoop Filesystem", ioe); - } - String scheme = fsscheme; // Makes an effectively final version of scheme - this.localityPreferred = - options - .get("locality") - .map(Boolean::parseBoolean) - .orElseGet(() -> LOCALITY_WHITELIST_FS.contains(scheme)); - } else { - this.localityPreferred = false; - } - - this.readTimestampWithoutZone = readConf.handleTimestampWithoutZone(); - } - - private void validateOptions( - Long snapshotId, Long asOfTimestamp, Long startSnapshotId, Long endSnapshotId) { - if (snapshotId != null && asOfTimestamp != null) { - throw new IllegalArgumentException( - "Cannot scan using both snapshot-id and as-of-timestamp to select the table snapshot"); - } - - if ((snapshotId != null || asOfTimestamp != null) - && (startSnapshotId != null || endSnapshotId != null)) { - throw new IllegalArgumentException( - "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either snapshot-id or " - + "as-of-timestamp is specified"); - } - - if (startSnapshotId == null && endSnapshotId != null) { - throw new IllegalArgumentException( - "Cannot only specify option end-snapshot-id to do incremental scan"); - } - } - - private TableScan configureBaseScan(boolean caseSensitive, DataSourceOptions options) { - Long snapshotId = readConf.snapshotId(); - Long asOfTimestamp = readConf.asOfTimestamp(); - Long startSnapshotId = readConf.startSnapshotId(); - Long endSnapshotId = readConf.endSnapshotId(); - validateOptions(snapshotId, asOfTimestamp, startSnapshotId, endSnapshotId); - - TableScan scan = table.newScan().caseSensitive(caseSensitive); - - if (snapshotId != null) { - scan = scan.useSnapshot(snapshotId); - } - - if (asOfTimestamp != null) { - scan = scan.asOfTime(asOfTimestamp); - } - - if (startSnapshotId != null) { - if (endSnapshotId != null) { - 
scan = scan.appendsBetween(startSnapshotId, endSnapshotId); - } else { - scan = scan.appendsAfter(startSnapshotId); - } - } - - // look for split behavior overrides in options - Long splitSize = options.get(SparkReadOptions.SPLIT_SIZE).map(Long::parseLong).orElse(null); - if (splitSize != null) { - scan = scan.option(TableProperties.SPLIT_SIZE, splitSize.toString()); - } - - Integer splitLookback = - options.get(SparkReadOptions.LOOKBACK).map(Integer::parseInt).orElse(null); - if (splitLookback != null) { - scan = scan.option(TableProperties.SPLIT_LOOKBACK, splitLookback.toString()); - } - - Long splitOpenFileCost = - options.get(SparkReadOptions.FILE_OPEN_COST).map(Long::parseLong).orElse(null); - if (splitOpenFileCost != null) { - scan = scan.option(TableProperties.SPLIT_OPEN_FILE_COST, splitOpenFileCost.toString()); - } - - return scan; - } - - protected Schema snapshotSchema() { - return baseScan.schema(); - } - - private Schema lazySchema() { - if (schema == null) { - if (requestedSchema != null) { - // the projection should include all columns that will be returned, including those only - // used in filters - this.schema = - SparkSchemaUtil.prune( - baseScan.schema(), requestedSchema, filterExpression(), baseScan.isCaseSensitive()); - } else { - this.schema = baseScan.schema(); - } - } - return schema; - } - - private Expression filterExpression() { - if (filterExpressions != null) { - return filterExpressions.stream().reduce(Expressions.alwaysTrue(), Expressions::and); - } - return Expressions.alwaysTrue(); - } - - private StructType lazyType() { - if (type == null) { - Preconditions.checkArgument( - readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(lazySchema()), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); - this.type = SparkSchemaUtil.convert(lazySchema()); - } - return type; - } - - @Override - public StructType readSchema() { - return lazyType(); - } - - /** - * This is called in the Spark Driver when data is to be materialized into {@link ColumnarBatch} - */ - @Override - public List> planBatchInputPartitions() { - Preconditions.checkState(enableBatchRead(), "Batched reads not enabled"); - Preconditions.checkState(batchSize > 0, "Invalid batch size"); - String expectedSchemaString = SchemaParser.toJson(lazySchema()); - - ValidationException.check( - tasks().stream().noneMatch(TableScanUtil::hasDeletes), - "Cannot scan table %s: cannot apply required delete files", - table); - - // broadcast the table metadata as input partitions will be sent to executors - Broadcast
<Table>
tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table)); - - List scanTasks = tasks(); - boolean caseSensitive = baseScan.isCaseSensitive(); - InputPartition[] readTasks = new InputPartition[scanTasks.size()]; - - Tasks.range(readTasks.length) - .stopOnFailure() - .executeWith(localityPreferred ? ThreadPools.getWorkerPool() : null) - .run( - index -> - readTasks[index] = - new ReadTask<>( - scanTasks.get(index), - tableBroadcast, - expectedSchemaString, - caseSensitive, - localityPreferred, - new BatchReaderFactory(batchSize))); - LOG.info("Batching input partitions with {} tasks.", readTasks.length); - - return Arrays.asList(readTasks); - } - - /** This is called in the Spark Driver when data is to be materialized into {@link InternalRow} */ - @Override - public List> planInputPartitions() { - String expectedSchemaString = SchemaParser.toJson(lazySchema()); - - // broadcast the table metadata as input partitions will be sent to executors - Broadcast
tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table)); - - List scanTasks = tasks(); - boolean caseSensitive = baseScan.isCaseSensitive(); - InputPartition[] readTasks = new InputPartition[scanTasks.size()]; - - Tasks.range(readTasks.length) - .stopOnFailure() - .executeWith(localityPreferred ? ThreadPools.getWorkerPool() : null) - .run( - index -> - readTasks[index] = - new ReadTask<>( - scanTasks.get(index), - tableBroadcast, - expectedSchemaString, - caseSensitive, - localityPreferred, - InternalRowReaderFactory.INSTANCE)); - - return Arrays.asList(readTasks); - } - - @Override - public Filter[] pushFilters(Filter[] filters) { - this.tasks = null; // invalidate cached tasks, if present - - List expressions = Lists.newArrayListWithExpectedSize(filters.length); - List pushed = Lists.newArrayListWithExpectedSize(filters.length); - - for (Filter filter : filters) { - Expression expr = SparkFilters.convert(filter); - if (expr != null) { - expressions.add(expr); - pushed.add(filter); - } - } - - this.filterExpressions = expressions; - this.pushedFilters = pushed.toArray(new Filter[0]); - - // invalidate the schema that will be projected - this.schema = null; - this.type = null; - - // Spark doesn't support residuals per task, so return all filters - // to get Spark to handle record-level filtering - return filters; - } - - @Override - public Filter[] pushedFilters() { - return pushedFilters; - } - - @Override - public void pruneColumns(StructType newRequestedSchema) { - this.requestedSchema = newRequestedSchema; - - // invalidate the schema that will be projected - this.schema = null; - this.type = null; - } - - @Override - public Statistics estimateStatistics() { - // its a fresh table, no data - if (table.currentSnapshot() == null) { - return new Stats(0L, 0L); - } - - // estimate stats using snapshot summary only for partitioned tables (metadata tables are - // unpartitioned) - if (!table.spec().isUnpartitioned() && filterExpression() == Expressions.alwaysTrue()) { - long totalRecords = - PropertyUtil.propertyAsLong( - table.currentSnapshot().summary(), - SnapshotSummary.TOTAL_RECORDS_PROP, - Long.MAX_VALUE); - return new Stats(SparkSchemaUtil.estimateSize(lazyType(), totalRecords), totalRecords); - } - - long numRows = 0L; - - for (CombinedScanTask task : tasks()) { - for (FileScanTask file : task.files()) { - // TODO: if possible, take deletes also into consideration. 
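          // The estimate scales each file's record count by the fraction of the file this task scans:
          // for example, a 64 MB split of a 256 MB data file with 1,000,000 records contributes
          // 0.25 * 1,000,000 = 250,000 rows to the estimate.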
- double fractionOfFileScanned = ((double) file.length()) / file.file().fileSizeInBytes(); - numRows += (fractionOfFileScanned * file.file().recordCount()); - } - } - - long sizeInBytes = SparkSchemaUtil.estimateSize(lazyType(), numRows); - return new Stats(sizeInBytes, numRows); - } - - @Override - public boolean enableBatchRead() { - if (readUsingBatch == null) { - boolean allParquetFileScanTasks = - tasks().stream() - .allMatch( - combinedScanTask -> - !combinedScanTask.isDataTask() - && combinedScanTask.files().stream() - .allMatch( - fileScanTask -> - fileScanTask.file().format().equals(FileFormat.PARQUET))); - - boolean allOrcFileScanTasks = - tasks().stream() - .allMatch( - combinedScanTask -> - !combinedScanTask.isDataTask() - && combinedScanTask.files().stream() - .allMatch( - fileScanTask -> - fileScanTask.file().format().equals(FileFormat.ORC))); - - boolean atLeastOneColumn = lazySchema().columns().size() > 0; - - boolean onlyPrimitives = - lazySchema().columns().stream().allMatch(c -> c.type().isPrimitiveType()); - - boolean hasNoDeleteFiles = tasks().stream().noneMatch(TableScanUtil::hasDeletes); - - boolean batchReadsEnabled = batchReadsEnabled(allParquetFileScanTasks, allOrcFileScanTasks); - - this.readUsingBatch = - batchReadsEnabled - && hasNoDeleteFiles - && (allOrcFileScanTasks - || (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives)); - - if (readUsingBatch) { - this.batchSize = batchSize(allParquetFileScanTasks, allOrcFileScanTasks); - } - } - return readUsingBatch; - } - - private boolean batchReadsEnabled(boolean isParquetOnly, boolean isOrcOnly) { - if (isParquetOnly) { - return readConf.parquetVectorizationEnabled(); - } else if (isOrcOnly) { - return readConf.orcVectorizationEnabled(); - } else { - return false; - } - } - - private int batchSize(boolean isParquetOnly, boolean isOrcOnly) { - if (isParquetOnly) { - return readConf.parquetBatchSize(); - } else if (isOrcOnly) { - return readConf.orcBatchSize(); - } else { - return 0; - } - } - - private static void mergeIcebergHadoopConfs(Configuration baseConf, Map options) { - options.keySet().stream() - .filter(key -> key.startsWith("hadoop.")) - .forEach(key -> baseConf.set(key.replaceFirst("hadoop.", ""), options.get(key))); - } - - private List tasks() { - if (tasks == null) { - TableScan scan = baseScan.project(lazySchema()); - - if (filterExpressions != null) { - for (Expression filter : filterExpressions) { - scan = scan.filter(filter); - } - } - - try (CloseableIterable tasksIterable = scan.planTasks()) { - this.tasks = Lists.newArrayList(tasksIterable); - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to close table scan: %s", scan); - } - } - - return tasks; - } - - @Override - public String toString() { - return String.format( - "IcebergScan(table=%s, type=%s, filters=%s, batchedReads=%s)", - table, lazySchema().asStruct(), filterExpressions, enableBatchRead()); - } - - private static class ReadTask implements Serializable, InputPartition { - private final CombinedScanTask task; - private final Broadcast
tableBroadcast; - private final String expectedSchemaString; - private final boolean caseSensitive; - private final boolean localityPreferred; - private final ReaderFactory readerFactory; - - private transient Schema expectedSchema = null; - private transient String[] preferredLocations = null; - - private ReadTask( - CombinedScanTask task, - Broadcast
tableBroadcast, - String expectedSchemaString, - boolean caseSensitive, - boolean localityPreferred, - ReaderFactory readerFactory) { - this.task = task; - this.tableBroadcast = tableBroadcast; - this.expectedSchemaString = expectedSchemaString; - this.caseSensitive = caseSensitive; - this.localityPreferred = localityPreferred; - this.preferredLocations = getPreferredLocations(); - this.readerFactory = readerFactory; - } - - @Override - public InputPartitionReader createPartitionReader() { - Table table = tableBroadcast.value(); - return readerFactory.create(task, table, lazyExpectedSchema(), caseSensitive); - } - - @Override - public String[] preferredLocations() { - return preferredLocations; - } - - private Schema lazyExpectedSchema() { - if (expectedSchema == null) { - this.expectedSchema = SchemaParser.fromJson(expectedSchemaString); - } - return expectedSchema; - } - - @SuppressWarnings("checkstyle:RegexpSingleline") - private String[] getPreferredLocations() { - if (!localityPreferred) { - return new String[0]; - } - - Configuration conf = SparkSession.active().sparkContext().hadoopConfiguration(); - return Util.blockLocations(task, conf); - } - } - - private interface ReaderFactory extends Serializable { - InputPartitionReader create( - CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive); - } - - private static class InternalRowReaderFactory implements ReaderFactory { - private static final InternalRowReaderFactory INSTANCE = new InternalRowReaderFactory(); - - private InternalRowReaderFactory() {} - - @Override - public InputPartitionReader create( - CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { - return new RowReader(task, table, expectedSchema, caseSensitive); - } - } - - private static class BatchReaderFactory implements ReaderFactory { - private final int batchSize; - - BatchReaderFactory(int batchSize) { - this.batchSize = batchSize; - } - - @Override - public InputPartitionReader create( - CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { - return new BatchReader(task, table, expectedSchema, caseSensitive, batchSize); - } - } - - private static class RowReader extends RowDataReader - implements InputPartitionReader { - RowReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { - super(task, table, expectedSchema, caseSensitive); - } - } - - private static class BatchReader extends BatchDataReader - implements InputPartitionReader { - BatchReader( - CombinedScanTask task, - Table table, - Schema expectedSchema, - boolean caseSensitive, - int size) { - super(task, table, expectedSchema, caseSensitive, size); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java deleted file mode 100644 index f206149da30e..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
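pushFilters above relies on SparkFilters.convert to translate Spark data source Filters into Iceberg Expressions, and it deliberately returns every filter so Spark still applies record-level filtering on top of Iceberg's file pruning. A small standalone sketch of the conversion step (column names and values are invented):

    import org.apache.iceberg.expressions.Expression;
    import org.apache.iceberg.spark.SparkFilters;
    import org.apache.spark.sql.sources.EqualTo;
    import org.apache.spark.sql.sources.Filter;
    import org.apache.spark.sql.sources.GreaterThan;

    public class FilterConversionSketch {
      public static void main(String[] args) {
        Filter[] sparkFilters = {new EqualTo("category", "books"), new GreaterThan("id", 100)};

        for (Filter filter : sparkFilters) {
          // convert returns null for filters it cannot translate; pushFilters only keeps the non-null ones.
          Expression expr = SparkFilters.convert(filter);
          System.out.println(filter + " -> " + expr);
        }
      }
    }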
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.Map; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataTask; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.DeleteFilter; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.SparkAvroReader; -import org.apache.iceberg.spark.data.SparkOrcReader; -import org.apache.iceberg.spark.data.SparkParquetReaders; -import org.apache.iceberg.types.TypeUtil; -import org.apache.spark.rdd.InputFileBlockHolder; -import org.apache.spark.sql.catalyst.InternalRow; - -class RowDataReader extends BaseDataReader { - - private final Schema tableSchema; - private final Schema expectedSchema; - private final String nameMapping; - private final boolean caseSensitive; - - RowDataReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { - super(table, task); - this.tableSchema = table.schema(); - this.expectedSchema = expectedSchema; - this.nameMapping = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - this.caseSensitive = caseSensitive; - } - - @Override - CloseableIterator open(FileScanTask task) { - SparkDeleteFilter deletes = new SparkDeleteFilter(task, tableSchema, expectedSchema); - - // schema or rows returned by readers - Schema requiredSchema = deletes.requiredSchema(); - Map idToConstant = constantsMap(task, expectedSchema); - DataFile file = task.file(); - - // update the current file for Spark's filename() function - InputFileBlockHolder.set(file.path().toString(), task.start(), task.length()); - - return deletes.filter(open(task, requiredSchema, idToConstant)).iterator(); - } - - protected Schema tableSchema() { - return tableSchema; - } - - protected CloseableIterable open( - FileScanTask task, Schema readSchema, Map idToConstant) { - CloseableIterable iter; - if (task.isDataTask()) { - iter = newDataIterable(task.asDataTask(), readSchema); - } else { - InputFile location = getInputFile(task); - Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); - - switch (task.file().format()) { - case PARQUET: - iter = newParquetIterable(location, task, readSchema, idToConstant); - break; - - case AVRO: - iter = newAvroIterable(location, task, readSchema, idToConstant); - break; - - case ORC: - iter = newOrcIterable(location, task, readSchema, idToConstant); - break; - - default: - 
throw new UnsupportedOperationException( - "Cannot read unknown format: " + task.file().format()); - } - } - - return iter; - } - - private CloseableIterable newAvroIterable( - InputFile location, FileScanTask task, Schema projection, Map idToConstant) { - Avro.ReadBuilder builder = - Avro.read(location) - .reuseContainers() - .project(projection) - .split(task.start(), task.length()) - .createReaderFunc( - readSchema -> new SparkAvroReader(projection, readSchema, idToConstant)); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newParquetIterable( - InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { - Parquet.ReadBuilder builder = - Parquet.read(location) - .reuseContainers() - .split(task.start(), task.length()) - .project(readSchema) - .createReaderFunc( - fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newOrcIterable( - InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = - ORC.read(location) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc( - readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newDataIterable(DataTask task, Schema readSchema) { - StructInternalRow row = new StructInternalRow(readSchema.asStruct()); - CloseableIterable asSparkRows = - CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); - return asSparkRows; - } - - protected class SparkDeleteFilter extends DeleteFilter { - private final InternalRowWrapper asStructLike; - - SparkDeleteFilter(FileScanTask task, Schema tableSchema, Schema requestedSchema) { - super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); - this.asStructLike = new InternalRowWrapper(SparkSchemaUtil.convert(requiredSchema())); - } - - @Override - protected StructLike asStructLike(InternalRow row) { - return asStructLike.wrap(row); - } - - @Override - protected InputFile getInputFile(String location) { - return RowDataReader.this.getInputFile(location); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java deleted file mode 100644 index b5022cc9882a..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
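The open(FileScanTask, ...) method deleted above picks a reader per data file format and fails loudly on anything it does not recognize. A compact, self-contained sketch of that dispatch shape in plain Java; FormatDispatch and the string-returning suppliers are illustrative stand-ins, not the removed Iceberg readers.

import java.util.function.Supplier;

// Dispatch on file format, mirroring the switch in the removed RowDataReader.open(...).
class FormatDispatch {
  enum FileFormat { PARQUET, AVRO, ORC }

  static Supplier<String> readerFor(FileFormat format) {
    switch (format) {
      case PARQUET:
        return () -> "parquet reader";
      case AVRO:
        return () -> "avro reader";
      case ORC:
        return () -> "orc reader";
      default:
        // unknown formats are a hard error, exactly like the removed code
        throw new UnsupportedOperationException("Cannot read unknown format: " + format);
    }
  }
}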
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.Serializable; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.TaskWriter; -import org.apache.iceberg.io.UnpartitionedWriter; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.spark.TaskContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class RowDataRewriter implements Serializable { - - private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); - - private final Broadcast
tableBroadcast; - private final PartitionSpec spec; - private final FileFormat format; - private final boolean caseSensitive; - - public RowDataRewriter( - Broadcast<Table>
tableBroadcast, PartitionSpec spec, boolean caseSensitive) { - this.tableBroadcast = tableBroadcast; - this.spec = spec; - this.caseSensitive = caseSensitive; - - Table table = tableBroadcast.value(); - String formatString = - table - .properties() - .getOrDefault( - TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); - this.format = FileFormat.fromString(formatString); - } - - public List rewriteDataForTasks(JavaRDD taskRDD) { - JavaRDD> dataFilesRDD = taskRDD.map(this::rewriteDataForTask); - - return dataFilesRDD.collect().stream().flatMap(Collection::stream).collect(Collectors.toList()); - } - - private List rewriteDataForTask(CombinedScanTask task) throws Exception { - TaskContext context = TaskContext.get(); - int partitionId = context.partitionId(); - long taskId = context.taskAttemptId(); - - Table table = tableBroadcast.value(); - Schema schema = table.schema(); - Map properties = table.properties(); - - RowDataReader dataReader = new RowDataReader(task, table, schema, caseSensitive); - - StructType structType = SparkSchemaUtil.convert(schema); - SparkAppenderFactory appenderFactory = - SparkAppenderFactory.builderFor(table, schema, structType).spec(spec).build(); - OutputFileFactory fileFactory = - OutputFileFactory.builderFor(table, partitionId, taskId) - .defaultSpec(spec) - .format(format) - .build(); - - TaskWriter writer; - if (spec.isUnpartitioned()) { - writer = - new UnpartitionedWriter<>( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE); - } else if (PropertyUtil.propertyAsBoolean( - properties, - TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, - TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT)) { - writer = - new SparkPartitionedFanoutWriter( - spec, - format, - appenderFactory, - fileFactory, - table.io(), - Long.MAX_VALUE, - schema, - structType); - } else { - writer = - new SparkPartitionedWriter( - spec, - format, - appenderFactory, - fileFactory, - table.io(), - Long.MAX_VALUE, - schema, - structType); - } - - try { - while (dataReader.next()) { - InternalRow row = dataReader.get(); - writer.write(row); - } - - dataReader.close(); - dataReader = null; - - writer.close(); - return Lists.newArrayList(writer.dataFiles()); - - } catch (Throwable originalThrowable) { - try { - LOG.error("Aborting task", originalThrowable); - context.markTaskFailed(originalThrowable); - - LOG.error( - "Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, - taskId, - context.attemptNumber(), - context.stageId(), - context.stageAttemptNumber()); - if (dataReader != null) { - dataReader.close(); - } - writer.abort(); - LOG.error( - "Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, - taskId, - context.taskAttemptId(), - context.stageId(), - context.stageAttemptNumber()); - - } catch (Throwable inner) { - if (originalThrowable != inner) { - originalThrowable.addSuppressed(inner); - LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner); - } - } - - if (originalThrowable instanceof Exception) { - throw originalThrowable; - } else { - throw new RuntimeException(originalThrowable); - } - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java deleted file mode 100644 index 6372edde0782..000000000000 --- 
a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Map; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.exceptions.RuntimeIOException; -import org.apache.iceberg.io.DataWriter; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.SparkAvroWriter; -import org.apache.iceberg.spark.data.SparkOrcWriter; -import org.apache.iceberg.spark.data.SparkParquetWriters; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; - -class SparkAppenderFactory implements FileAppenderFactory { - private final Map properties; - private final Schema writeSchema; - private final StructType dsSchema; - private final PartitionSpec spec; - private final int[] equalityFieldIds; - private final Schema eqDeleteRowSchema; - private final Schema posDeleteRowSchema; - - private StructType eqDeleteSparkType = null; - private StructType posDeleteSparkType = null; - - SparkAppenderFactory( - Map properties, - Schema writeSchema, - StructType dsSchema, - PartitionSpec spec, - int[] equalityFieldIds, - Schema eqDeleteRowSchema, - Schema posDeleteRowSchema) { - this.properties = properties; - this.writeSchema = writeSchema; - this.dsSchema = dsSchema; - this.spec = spec; - this.equalityFieldIds = equalityFieldIds; - this.eqDeleteRowSchema = eqDeleteRowSchema; - this.posDeleteRowSchema = posDeleteRowSchema; - } - - static Builder builderFor(Table table, Schema writeSchema, StructType dsSchema) { - return new Builder(table, writeSchema, dsSchema); - } - - static class Builder { - private final Table table; - private final Schema writeSchema; - private final StructType dsSchema; - private PartitionSpec spec; - private int[] equalityFieldIds; - private Schema 
eqDeleteRowSchema; - private Schema posDeleteRowSchema; - - Builder(Table table, Schema writeSchema, StructType dsSchema) { - this.table = table; - this.spec = table.spec(); - this.writeSchema = writeSchema; - this.dsSchema = dsSchema; - } - - Builder spec(PartitionSpec newSpec) { - this.spec = newSpec; - return this; - } - - Builder equalityFieldIds(int[] newEqualityFieldIds) { - this.equalityFieldIds = newEqualityFieldIds; - return this; - } - - Builder eqDeleteRowSchema(Schema newEqDeleteRowSchema) { - this.eqDeleteRowSchema = newEqDeleteRowSchema; - return this; - } - - Builder posDelRowSchema(Schema newPosDelRowSchema) { - this.posDeleteRowSchema = newPosDelRowSchema; - return this; - } - - SparkAppenderFactory build() { - Preconditions.checkNotNull(table, "Table must not be null"); - Preconditions.checkNotNull(writeSchema, "Write Schema must not be null"); - Preconditions.checkNotNull(dsSchema, "DS Schema must not be null"); - if (equalityFieldIds != null) { - Preconditions.checkNotNull( - eqDeleteRowSchema, - "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); - } - if (eqDeleteRowSchema != null) { - Preconditions.checkNotNull( - equalityFieldIds, - "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); - } - - return new SparkAppenderFactory( - table.properties(), - writeSchema, - dsSchema, - spec, - equalityFieldIds, - eqDeleteRowSchema, - posDeleteRowSchema); - } - } - - private StructType lazyEqDeleteSparkType() { - if (eqDeleteSparkType == null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null"); - this.eqDeleteSparkType = SparkSchemaUtil.convert(eqDeleteRowSchema); - } - return eqDeleteSparkType; - } - - private StructType lazyPosDeleteSparkType() { - if (posDeleteSparkType == null) { - Preconditions.checkNotNull( - posDeleteRowSchema, "Position delete row schema shouldn't be null"); - this.posDeleteSparkType = SparkSchemaUtil.convert(posDeleteRowSchema); - } - return posDeleteSparkType; - } - - @Override - public FileAppender newAppender(OutputFile file, FileFormat fileFormat) { - MetricsConfig metricsConfig = MetricsConfig.fromProperties(properties); - try { - switch (fileFormat) { - case PARQUET: - return Parquet.write(file) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(dsSchema, msgType)) - .setAll(properties) - .metricsConfig(metricsConfig) - .schema(writeSchema) - .overwrite() - .build(); - - case AVRO: - return Avro.write(file) - .createWriterFunc(ignored -> new SparkAvroWriter(dsSchema)) - .setAll(properties) - .schema(writeSchema) - .overwrite() - .build(); - - case ORC: - return ORC.write(file) - .createWriterFunc(SparkOrcWriter::new) - .setAll(properties) - .metricsConfig(metricsConfig) - .schema(writeSchema) - .overwrite() - .build(); - - default: - throw new UnsupportedOperationException("Cannot write unknown format: " + fileFormat); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - @Override - public DataWriter newDataWriter( - EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>( - newAppender(file.encryptingOutputFile(), format), - format, - file.encryptingOutputFile().location(), - spec, - partition, - file.keyMetadata()); - } - - @Override - public EqualityDeleteWriter newEqDeleteWriter( - EncryptedOutputFile file, FileFormat format, StructLike partition) { - Preconditions.checkState( - equalityFieldIds != null && equalityFieldIds.length > 0, - "Equality field ids shouldn't 
be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull( - eqDeleteRowSchema, - "Equality delete row schema shouldn't be null when creating equality-delete writer"); - - try { - switch (format) { - case PARQUET: - return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc( - msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) - .overwrite() - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .equalityFieldIds(equalityFieldIds) - .withKeyMetadata(file.keyMetadata()) - .buildEqualityWriter(); - - case AVRO: - return Avro.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(ignored -> new SparkAvroWriter(lazyEqDeleteSparkType())) - .overwrite() - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .equalityFieldIds(equalityFieldIds) - .withKeyMetadata(file.keyMetadata()) - .buildEqualityWriter(); - - case ORC: - return ORC.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(SparkOrcWriter::new) - .overwrite() - .rowSchema(eqDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .equalityFieldIds(equalityFieldIds) - .withKeyMetadata(file.keyMetadata()) - .buildEqualityWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write equality-deletes for unsupported file format: " + format); - } - } catch (IOException e) { - throw new UncheckedIOException("Failed to create new equality delete writer", e); - } - } - - @Override - public PositionDeleteWriter newPosDeleteWriter( - EncryptedOutputFile file, FileFormat format, StructLike partition) { - try { - switch (format) { - case PARQUET: - StructType sparkPosDeleteSchema = - SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); - return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc( - msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) - .overwrite() - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(file.keyMetadata()) - .transformPaths(path -> UTF8String.fromString(path.toString())) - .buildPositionWriter(); - - case AVRO: - return Avro.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(ignored -> new SparkAvroWriter(lazyPosDeleteSparkType())) - .overwrite() - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(file.keyMetadata()) - .buildPositionWriter(); - - case ORC: - return ORC.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(SparkOrcWriter::new) - .overwrite() - .rowSchema(posDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(file.keyMetadata()) - .transformPaths(path -> UTF8String.fromString(path.toString())) - .buildPositionWriter(); - - default: - throw new UnsupportedOperationException( - "Cannot write pos-deletes for unsupported file format: " + format); - } - - } catch (IOException e) { - throw new UncheckedIOException("Failed to create new equality delete writer", e); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java deleted file mode 100644 index 26c14b3f8346..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor 
license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - -import java.util.Map; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.BaseFileWriterFactory; -import org.apache.iceberg.io.DeleteSchemaUtil; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.SparkAvroWriter; -import org.apache.iceberg.spark.data.SparkOrcWriter; -import org.apache.iceberg.spark.data.SparkParquetWriters; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; - -class SparkFileWriterFactory extends BaseFileWriterFactory { - private StructType dataSparkType; - private StructType equalityDeleteSparkType; - private StructType positionDeleteSparkType; - - SparkFileWriterFactory( - Table table, - FileFormat dataFileFormat, - Schema dataSchema, - StructType dataSparkType, - SortOrder dataSortOrder, - FileFormat deleteFileFormat, - int[] equalityFieldIds, - Schema equalityDeleteRowSchema, - StructType equalityDeleteSparkType, - SortOrder equalityDeleteSortOrder, - Schema positionDeleteRowSchema, - StructType positionDeleteSparkType) { - - super( - table, - dataFileFormat, - dataSchema, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteSortOrder, - positionDeleteRowSchema); - - this.dataSparkType = dataSparkType; - this.equalityDeleteSparkType = equalityDeleteSparkType; - this.positionDeleteSparkType = positionDeleteSparkType; - } - - static Builder builderFor(Table table) { - return new Builder(table); - } - - @Override - protected void configureDataWrite(Avro.DataWriteBuilder builder) { - builder.createWriterFunc(ignored -> new SparkAvroWriter(dataSparkType())); - } - - @Override - protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(ignored -> new SparkAvroWriter(equalityDeleteSparkType())); - } - - @Override - protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - boolean withRow = - positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); - if (withRow) { - // SparkAvroWriter accepts 
just the Spark type of the row ignoring the path and pos - StructField rowField = positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME); - StructType positionDeleteRowSparkType = (StructType) rowField.dataType(); - builder.createWriterFunc(ignored -> new SparkAvroWriter(positionDeleteRowSparkType)); - } - } - - @Override - protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(dataSparkType(), msgType)); - } - - @Override - protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); - } - - @Override - protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); - builder.transformPaths(path -> UTF8String.fromString(path.toString())); - } - - @Override - protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc(SparkOrcWriter::new); - } - - @Override - protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc(SparkOrcWriter::new); - } - - @Override - protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc(SparkOrcWriter::new); - builder.transformPaths(path -> UTF8String.fromString(path.toString())); - } - - private StructType dataSparkType() { - if (dataSparkType == null) { - Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); - this.dataSparkType = SparkSchemaUtil.convert(dataSchema()); - } - - return dataSparkType; - } - - private StructType equalityDeleteSparkType() { - if (equalityDeleteSparkType == null) { - Preconditions.checkNotNull( - equalityDeleteRowSchema(), "Equality delete schema must not be null"); - this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema()); - } - - return equalityDeleteSparkType; - } - - private StructType positionDeleteSparkType() { - if (positionDeleteSparkType == null) { - // wrap the optional row schema into the position delete schema that contains path and - // position - Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); - this.positionDeleteSparkType = SparkSchemaUtil.convert(positionDeleteSchema); - } - - return positionDeleteSparkType; - } - - static class Builder { - private final Table table; - private FileFormat dataFileFormat; - private Schema dataSchema; - private StructType dataSparkType; - private SortOrder dataSortOrder; - private FileFormat deleteFileFormat; - private int[] equalityFieldIds; - private Schema equalityDeleteRowSchema; - private StructType equalityDeleteSparkType; - private SortOrder equalityDeleteSortOrder; - private Schema positionDeleteRowSchema; - private StructType positionDeleteSparkType; - - Builder(Table table) { - this.table = table; - - Map properties = table.properties(); - - String dataFileFormatName = - properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); - this.dataFileFormat = FileFormat.fromString(dataFileFormatName); - - String deleteFileFormatName = - properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); - this.deleteFileFormat = FileFormat.fromString(deleteFileFormatName); - } - - Builder dataFileFormat(FileFormat newDataFileFormat) { - this.dataFileFormat = newDataFileFormat; - return this; - } - - Builder dataSchema(Schema 
newDataSchema) { - this.dataSchema = newDataSchema; - return this; - } - - Builder dataSparkType(StructType newDataSparkType) { - this.dataSparkType = newDataSparkType; - return this; - } - - Builder dataSortOrder(SortOrder newDataSortOrder) { - this.dataSortOrder = newDataSortOrder; - return this; - } - - Builder deleteFileFormat(FileFormat newDeleteFileFormat) { - this.deleteFileFormat = newDeleteFileFormat; - return this; - } - - Builder equalityFieldIds(int[] newEqualityFieldIds) { - this.equalityFieldIds = newEqualityFieldIds; - return this; - } - - Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { - this.equalityDeleteRowSchema = newEqualityDeleteRowSchema; - return this; - } - - Builder equalityDeleteSparkType(StructType newEqualityDeleteSparkType) { - this.equalityDeleteSparkType = newEqualityDeleteSparkType; - return this; - } - - Builder equalityDeleteSortOrder(SortOrder newEqualityDeleteSortOrder) { - this.equalityDeleteSortOrder = newEqualityDeleteSortOrder; - return this; - } - - Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { - this.positionDeleteRowSchema = newPositionDeleteRowSchema; - return this; - } - - Builder positionDeleteSparkType(StructType newPositionDeleteSparkType) { - this.positionDeleteSparkType = newPositionDeleteSparkType; - return this; - } - - SparkFileWriterFactory build() { - boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; - boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument( - noEqualityDeleteConf || fullEqualityDeleteConf, - "Equality field IDs and equality delete row schema must be set together"); - - return new SparkFileWriterFactory( - table, - dataFileFormat, - dataSchema, - dataSparkType, - dataSortOrder, - deleteFileFormat, - equalityFieldIds, - equalityDeleteRowSchema, - equalityDeleteSparkType, - equalityDeleteSortOrder, - positionDeleteRowSchema, - positionDeleteSparkType); - } - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java deleted file mode 100644 index f17cd260f928..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
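Both removed builders (SparkAppenderFactory.Builder earlier and SparkFileWriterFactory.Builder here) insist that equality field IDs and the equality delete row schema be configured as a pair. A small plain-Java sketch of that either-both-or-neither check; DeleteConfigBuilder is an illustrative name, not one of the removed classes.

// Validate that two related settings are supplied together or not at all.
class DeleteConfigBuilder {
  private int[] equalityFieldIds;
  private Object equalityDeleteRowSchema;

  DeleteConfigBuilder equalityFieldIds(int[] ids) {
    this.equalityFieldIds = ids;
    return this;
  }

  DeleteConfigBuilder equalityDeleteRowSchema(Object schema) {
    this.equalityDeleteRowSchema = schema;
    return this;
  }

  void build() {
    boolean none = equalityFieldIds == null && equalityDeleteRowSchema == null;
    boolean both = equalityFieldIds != null && equalityDeleteRowSchema != null;
    if (!(none || both)) {
      throw new IllegalArgumentException(
          "Equality field IDs and equality delete row schema must be set together");
    }
  }
}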
- */ -package org.apache.iceberg.spark.source; - -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.PartitionedFanoutWriter; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; - -public class SparkPartitionedFanoutWriter extends PartitionedFanoutWriter { - private final PartitionKey partitionKey; - private final InternalRowWrapper internalRowWrapper; - - public SparkPartitionedFanoutWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - StructType sparkSchema) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.partitionKey = new PartitionKey(spec, schema); - this.internalRowWrapper = new InternalRowWrapper(sparkSchema); - } - - @Override - protected PartitionKey partition(InternalRow row) { - partitionKey.partition(internalRowWrapper.wrap(row)); - return partitionKey; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java deleted file mode 100644 index a86091644360..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionKey; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.PartitionedWriter; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; - -public class SparkPartitionedWriter extends PartitionedWriter { - private final PartitionKey partitionKey; - private final InternalRowWrapper internalRowWrapper; - - public SparkPartitionedWriter( - PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - StructType sparkSchema) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize); - this.partitionKey = new PartitionKey(spec, schema); - this.internalRowWrapper = new InternalRowWrapper(sparkSchema); - } - - @Override - protected PartitionKey partition(InternalRow row) { - partitionKey.partition(internalRowWrapper.wrap(row)); - return partitionKey; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java deleted file mode 100644 index c4142011b294..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.OptionalLong; -import org.apache.spark.sql.sources.v2.reader.Statistics; - -class Stats implements Statistics { - private final OptionalLong sizeInBytes; - private final OptionalLong numRows; - - Stats(long sizeInBytes, long numRows) { - this.sizeInBytes = OptionalLong.of(sizeInBytes); - this.numRows = OptionalLong.of(numRows); - } - - @Override - public OptionalLong sizeInBytes() { - return sizeInBytes; - } - - @Override - public OptionalLong numRows() { - return numRows; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java deleted file mode 100644 index 9fa4c63dda2b..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
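The two partitioned writers removed above differ only in their base class: the fanout variant keeps an open appender per partition key and accepts rows in any order, while the clustered variant keeps a single appender and rolls it on every key change, assuming input grouped by partition. A rough plain-Java sketch of that difference; StringBuilder stands in for a real file appender and the row encoding ("key:value") is invented for the example.

import java.util.HashMap;
import java.util.Map;

class FanoutVsClustered {
  // Fanout: one open "appender" per key, order of incoming rows does not matter.
  static void fanoutWrite(Iterable<String> keyedRows, Map<String, StringBuilder> openWriters) {
    for (String row : keyedRows) {
      String key = row.split(":", 2)[0];
      openWriters.computeIfAbsent(key, k -> new StringBuilder()).append(row).append('\n');
    }
  }

  // Clustered: a single open "appender", closed whenever the partition key changes,
  // so the input is expected to be grouped by key.
  static Map<String, StringBuilder> clusteredWrite(Iterable<String> sortedKeyedRows) {
    Map<String, StringBuilder> completed = new HashMap<>();
    String currentKey = null;
    StringBuilder current = null;
    for (String row : sortedKeyedRows) {
      String key = row.split(":", 2)[0];
      if (!key.equals(currentKey)) {
        if (current != null) {
          completed.put(currentKey, current); // "close" the previous partition
        }
        currentKey = key;
        current = new StringBuilder();
      }
      current.append(row).append('\n');
    }
    if (current != null) {
      completed.put(currentKey, current);
    }
    return completed;
  }
}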
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.databind.JsonNode; -import java.io.IOException; -import java.io.StringWriter; -import java.io.UncheckedIOException; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.JsonUtil; -import org.apache.spark.sql.sources.v2.reader.streaming.Offset; - -class StreamingOffset extends Offset { - static final StreamingOffset START_OFFSET = new StreamingOffset(-1L, -1, false); - - private static final int CURR_VERSION = 1; - private static final String VERSION = "version"; - private static final String SNAPSHOT_ID = "snapshot_id"; - private static final String POSITION = "position"; - private static final String SCAN_ALL_FILES = "scan_all_files"; - - private final long snapshotId; - private final long position; - private final boolean scanAllFiles; - - /** - * An implementation of Spark Structured Streaming Offset, to track the current processed files of - * Iceberg table. - * - * @param snapshotId The current processed snapshot id. - * @param position The position of last scanned file in snapshot. - * @param scanAllFiles whether to scan all files in a snapshot; for example, to read all data when - * starting a stream. - */ - StreamingOffset(long snapshotId, long position, boolean scanAllFiles) { - this.snapshotId = snapshotId; - this.position = position; - this.scanAllFiles = scanAllFiles; - } - - static StreamingOffset fromJson(String json) { - Preconditions.checkNotNull(json, "Cannot parse StreamingOffset JSON: null"); - - try { - JsonNode node = JsonUtil.mapper().readValue(json, JsonNode.class); - // The version of StreamingOffset. The offset was created with a version number - // used to validate when deserializing from json string. 
- int version = JsonUtil.getInt(VERSION, node); - Preconditions.checkArgument( - version == CURR_VERSION, - "Cannot parse offset JSON: offset version %s is not supported", - version); - - long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); - int position = JsonUtil.getInt(POSITION, node); - boolean shouldScanAllFiles = JsonUtil.getBool(SCAN_ALL_FILES, node); - - return new StreamingOffset(snapshotId, position, shouldScanAllFiles); - } catch (IOException e) { - throw new IllegalArgumentException( - String.format("Failed to parse StreamingOffset from JSON string %s", json), e); - } - } - - @Override - public String json() { - StringWriter writer = new StringWriter(); - try { - JsonGenerator generator = JsonUtil.factory().createGenerator(writer); - generator.writeStartObject(); - generator.writeNumberField(VERSION, CURR_VERSION); - generator.writeNumberField(SNAPSHOT_ID, snapshotId); - generator.writeNumberField(POSITION, position); - generator.writeBooleanField(SCAN_ALL_FILES, scanAllFiles); - generator.writeEndObject(); - generator.flush(); - - } catch (IOException e) { - throw new UncheckedIOException("Failed to write StreamingOffset to json", e); - } - - return writer.toString(); - } - - long snapshotId() { - return snapshotId; - } - - long position() { - return position; - } - - boolean shouldScanAllFiles() { - return scanAllFiles; - } - - @Override - public boolean equals(Object obj) { - if (obj instanceof StreamingOffset) { - StreamingOffset offset = (StreamingOffset) obj; - return offset.snapshotId == snapshotId - && offset.position == position - && offset.scanAllFiles == scanAllFiles; - } else { - return false; - } - } - - @Override - public int hashCode() { - return Objects.hashCode(snapshotId, position, scanAllFiles); - } - - @Override - public String toString() { - return String.format( - "Streaming Offset[%d: position (%d) scan_all_files (%b)]", - snapshotId, position, scanAllFiles); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java deleted file mode 100644 index 64ad7e672866..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
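The removed StreamingOffset serializes itself as versioned JSON and rejects offsets written with an unknown version before reading any other field. A hedged sketch of the write side and that version check, written directly against Jackson rather than Iceberg's JsonUtil; the class and method names are illustrative, not the removed API.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.IOException;
import java.io.UncheckedIOException;

class OffsetJson {
  private static final int CURR_VERSION = 1;
  private static final ObjectMapper MAPPER = new ObjectMapper();

  // Write: include a version field so future readers can reject layouts they do not know.
  static String toJson(long snapshotId, long position, boolean scanAllFiles) {
    ObjectNode node = MAPPER.createObjectNode();
    node.put("version", CURR_VERSION);
    node.put("snapshot_id", snapshotId);
    node.put("position", position);
    node.put("scan_all_files", scanAllFiles);
    return node.toString();
  }

  // Read: validate the version before trusting any other field.
  static void validate(String json) {
    try {
      JsonNode node = MAPPER.readTree(json);
      int version = node.get("version").asInt();
      if (version != CURR_VERSION) {
        throw new IllegalArgumentException("Unsupported offset version: " + version);
      }
    } catch (IOException e) {
      throw new UncheckedIOException("Failed to parse offset JSON: " + json, e);
    }
  }
}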
- */ -package org.apache.iceberg.spark.source; - -import java.util.Map; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.OverwriteFiles; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.spark.SparkWriteConf; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; -import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter; -import org.apache.spark.sql.streaming.OutputMode; -import org.apache.spark.sql.types.StructType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class StreamingWriter extends Writer implements StreamWriter { - - private static final Logger LOG = LoggerFactory.getLogger(StreamingWriter.class); - private static final String QUERY_ID_PROPERTY = "spark.sql.streaming.queryId"; - private static final String EPOCH_ID_PROPERTY = "spark.sql.streaming.epochId"; - - private final String queryId; - private final OutputMode mode; - - StreamingWriter( - SparkSession spark, - Table table, - SparkWriteConf writeConf, - String queryId, - OutputMode mode, - String applicationId, - Schema writeSchema, - StructType dsSchema) { - super(spark, table, writeConf, false, applicationId, writeSchema, dsSchema); - this.queryId = queryId; - this.mode = mode; - } - - @Override - public void commit(long epochId, WriterCommitMessage[] messages) { - LOG.info("Committing epoch {} for query {} in {} mode", epochId, queryId, mode); - - table().refresh(); - Long lastCommittedEpochId = getLastCommittedEpochId(); - if (lastCommittedEpochId != null && epochId <= lastCommittedEpochId) { - LOG.info("Skipping epoch {} for query {} as it was already committed", epochId, queryId); - return; - } - - if (mode == OutputMode.Complete()) { - OverwriteFiles overwriteFiles = table().newOverwrite(); - overwriteFiles.overwriteByRowFilter(Expressions.alwaysTrue()); - int numFiles = 0; - for (DataFile file : files(messages)) { - overwriteFiles.addFile(file); - numFiles++; - } - commit(overwriteFiles, epochId, numFiles, "streaming complete overwrite"); - } else { - AppendFiles append = table().newFastAppend(); - int numFiles = 0; - for (DataFile file : files(messages)) { - append.appendFile(file); - numFiles++; - } - commit(append, epochId, numFiles, "streaming append"); - } - } - - private void commit( - SnapshotUpdate snapshotUpdate, long epochId, int numFiles, String description) { - snapshotUpdate.set(QUERY_ID_PROPERTY, queryId); - snapshotUpdate.set(EPOCH_ID_PROPERTY, Long.toString(epochId)); - commitOperation(snapshotUpdate, numFiles, description); - } - - @Override - public void abort(long epochId, WriterCommitMessage[] messages) { - abort(messages); - } - - private Long getLastCommittedEpochId() { - Snapshot snapshot = table().currentSnapshot(); - Long lastCommittedEpochId = null; - while (snapshot != null) { - Map summary = snapshot.summary(); - String snapshotQueryId = summary.get(QUERY_ID_PROPERTY); - if (queryId.equals(snapshotQueryId)) { - lastCommittedEpochId = Long.valueOf(summary.get(EPOCH_ID_PROPERTY)); - break; - } - Long parentSnapshotId = snapshot.parentId(); - snapshot = parentSnapshotId != null ? 
table().snapshot(parentSnapshotId) : null; - } - return lastCommittedEpochId; - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java deleted file mode 100644 index 3c7ebabeab3d..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDate; -import java.time.OffsetDateTime; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.function.BiConsumer; -import java.util.function.Function; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.util.ArrayBasedMapData; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.GenericArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.BinaryType; -import org.apache.spark.sql.types.BooleanType; -import org.apache.spark.sql.types.ByteType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DateType; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.types.DecimalType; -import org.apache.spark.sql.types.DoubleType; -import org.apache.spark.sql.types.FloatType; -import org.apache.spark.sql.types.IntegerType; -import org.apache.spark.sql.types.LongType; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.ShortType; -import org.apache.spark.sql.types.StringType; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.types.TimestampType; -import org.apache.spark.unsafe.types.CalendarInterval; -import org.apache.spark.unsafe.types.UTF8String; - -class StructInternalRow extends InternalRow { - private final Types.StructType type; - private StructLike struct; - - StructInternalRow(Types.StructType type) { - this.type = type; - } - - private StructInternalRow(Types.StructType type, StructLike struct) { - this.type = type; - this.struct = struct; - } - - public StructInternalRow setStruct(StructLike newStruct) { - this.struct = newStruct; - return this; - } - 
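The removed StreamingWriter makes its streaming commits idempotent: it walks the snapshot lineage, finds the epoch last recorded for the current query ID in a snapshot summary, and skips any epoch at or below it. A self-contained plain-Java sketch of that check; the Snapshot holder and map-based lookup are illustrative, while the two summary keys match the properties used in the removed code.

import java.util.Map;

class EpochDedup {
  static class Snapshot {
    final Long parentId;
    final Map<String, String> summary;

    Snapshot(Long parentId, Map<String, String> summary) {
      this.parentId = parentId;
      this.summary = summary;
    }
  }

  // Returns true only if this epoch has not already been committed for the given query.
  static boolean shouldCommit(
      long epochId, String queryId, Long currentSnapshotId, Map<Long, Snapshot> snapshots) {
    Long cursor = currentSnapshotId;
    while (cursor != null) {
      Snapshot snapshot = snapshots.get(cursor);
      if (snapshot == null) {
        break;
      }
      if (queryId.equals(snapshot.summary.get("spark.sql.streaming.queryId"))) {
        // the removed code assumes the epoch id is present whenever the query id matches
        long lastEpoch = Long.parseLong(snapshot.summary.get("spark.sql.streaming.epochId"));
        return epochId > lastEpoch; // replayed epochs become no-ops
      }
      cursor = snapshot.parentId;
    }
    return true; // no prior commit for this query
  }
}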
- @Override - public int numFields() { - return struct.size(); - } - - @Override - public void setNullAt(int i) { - throw new UnsupportedOperationException("StructInternalRow is read-only"); - } - - @Override - public void update(int i, Object value) { - throw new UnsupportedOperationException("StructInternalRow is read-only"); - } - - @Override - public InternalRow copy() { - return this; - } - - @Override - public boolean isNullAt(int ordinal) { - return struct.get(ordinal, Object.class) == null; - } - - @Override - public boolean getBoolean(int ordinal) { - return struct.get(ordinal, Boolean.class); - } - - @Override - public byte getByte(int ordinal) { - return (byte) (int) struct.get(ordinal, Integer.class); - } - - @Override - public short getShort(int ordinal) { - return (short) (int) struct.get(ordinal, Integer.class); - } - - @Override - public int getInt(int ordinal) { - Object integer = struct.get(ordinal, Object.class); - - if (integer instanceof Integer) { - return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); - } else { - throw new IllegalStateException( - "Unknown type for int field. Type name: " + integer.getClass().getName()); - } - } - - @Override - public long getLong(int ordinal) { - Object longVal = struct.get(ordinal, Object.class); - - if (longVal instanceof Long) { - return (long) longVal; - } else if (longVal instanceof OffsetDateTime) { - return Duration.between(Instant.EPOCH, (OffsetDateTime) longVal).toNanos() / 1000; - } else if (longVal instanceof LocalDate) { - return ((LocalDate) longVal).toEpochDay(); - } else { - throw new IllegalStateException( - "Unknown type for long field. Type name: " + longVal.getClass().getName()); - } - } - - @Override - public float getFloat(int ordinal) { - return struct.get(ordinal, Float.class); - } - - @Override - public double getDouble(int ordinal) { - return struct.get(ordinal, Double.class); - } - - @Override - public Decimal getDecimal(int ordinal, int precision, int scale) { - return isNullAt(ordinal) ? null : getDecimalInternal(ordinal, precision, scale); - } - - private Decimal getDecimalInternal(int ordinal, int precision, int scale) { - return Decimal.apply(struct.get(ordinal, BigDecimal.class)); - } - - @Override - public UTF8String getUTF8String(int ordinal) { - return isNullAt(ordinal) ? null : getUTF8StringInternal(ordinal); - } - - private UTF8String getUTF8StringInternal(int ordinal) { - CharSequence seq = struct.get(ordinal, CharSequence.class); - return UTF8String.fromString(seq.toString()); - } - - @Override - public byte[] getBinary(int ordinal) { - return isNullAt(ordinal) ? null : getBinaryInternal(ordinal); - } - - private byte[] getBinaryInternal(int ordinal) { - Object bytes = struct.get(ordinal, Object.class); - - // should only be either ByteBuffer or byte[] - if (bytes instanceof ByteBuffer) { - return ByteBuffers.toByteArray((ByteBuffer) bytes); - } else if (bytes instanceof byte[]) { - return (byte[]) bytes; - } else { - throw new IllegalStateException( - "Unknown type for binary field. Type name: " + bytes.getClass().getName()); - } - } - - @Override - public CalendarInterval getInterval(int ordinal) { - throw new UnsupportedOperationException("Unsupported type: interval"); - } - - @Override - public InternalRow getStruct(int ordinal, int numFields) { - return isNullAt(ordinal) ? 
null : getStructInternal(ordinal, numFields); - } - - private InternalRow getStructInternal(int ordinal, int numFields) { - return new StructInternalRow( - type.fields().get(ordinal).type().asStructType(), struct.get(ordinal, StructLike.class)); - } - - @Override - public ArrayData getArray(int ordinal) { - return isNullAt(ordinal) ? null : getArrayInternal(ordinal); - } - - private ArrayData getArrayInternal(int ordinal) { - return collectionToArrayData( - type.fields().get(ordinal).type().asListType().elementType(), - struct.get(ordinal, Collection.class)); - } - - @Override - public MapData getMap(int ordinal) { - return isNullAt(ordinal) ? null : getMapInternal(ordinal); - } - - private MapData getMapInternal(int ordinal) { - return mapToMapData( - type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); - } - - @Override - @SuppressWarnings("checkstyle:CyclomaticComplexity") - public Object get(int ordinal, DataType dataType) { - if (isNullAt(ordinal)) { - return null; - } - - if (dataType instanceof IntegerType) { - return getInt(ordinal); - } else if (dataType instanceof LongType) { - return getLong(ordinal); - } else if (dataType instanceof StringType) { - return getUTF8StringInternal(ordinal); - } else if (dataType instanceof FloatType) { - return getFloat(ordinal); - } else if (dataType instanceof DoubleType) { - return getDouble(ordinal); - } else if (dataType instanceof DecimalType) { - DecimalType decimalType = (DecimalType) dataType; - return getDecimalInternal(ordinal, decimalType.precision(), decimalType.scale()); - } else if (dataType instanceof BinaryType) { - return getBinaryInternal(ordinal); - } else if (dataType instanceof StructType) { - return getStructInternal(ordinal, ((StructType) dataType).size()); - } else if (dataType instanceof ArrayType) { - return getArrayInternal(ordinal); - } else if (dataType instanceof MapType) { - return getMapInternal(ordinal); - } else if (dataType instanceof BooleanType) { - return getBoolean(ordinal); - } else if (dataType instanceof ByteType) { - return getByte(ordinal); - } else if (dataType instanceof ShortType) { - return getShort(ordinal); - } else if (dataType instanceof DateType) { - return getInt(ordinal); - } else if (dataType instanceof TimestampType) { - return getLong(ordinal); - } - return null; - } - - private MapData mapToMapData(Types.MapType mapType, Map map) { - // make a defensive copy to ensure entries do not change - List> entries = ImmutableList.copyOf(map.entrySet()); - return new ArrayBasedMapData( - collectionToArrayData(mapType.keyType(), Lists.transform(entries, Map.Entry::getKey)), - collectionToArrayData(mapType.valueType(), Lists.transform(entries, Map.Entry::getValue))); - } - - private ArrayData collectionToArrayData(Type elementType, Collection values) { - switch (elementType.typeId()) { - case BOOLEAN: - case INTEGER: - case DATE: - case TIME: - case LONG: - case TIMESTAMP: - case FLOAT: - case DOUBLE: - return fillArray(values, array -> (pos, value) -> array[pos] = value); - case STRING: - return fillArray( - values, - array -> - (BiConsumer) - (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); - case FIXED: - case BINARY: - return fillArray( - values, - array -> - (BiConsumer) - (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); - case DECIMAL: - return fillArray( - values, - array -> - (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); - case STRUCT: - return fillArray( - values, - array -> - (BiConsumer) - (pos, tuple) -> - array[pos] = 
new StructInternalRow(elementType.asStructType(), tuple)); - case LIST: - return fillArray( - values, - array -> - (BiConsumer>) - (pos, list) -> - array[pos] = - collectionToArrayData(elementType.asListType().elementType(), list)); - case MAP: - return fillArray( - values, - array -> - (BiConsumer>) - (pos, map) -> array[pos] = mapToMapData(elementType.asMapType(), map)); - default: - throw new UnsupportedOperationException("Unsupported array element type: " + elementType); - } - } - - @SuppressWarnings("unchecked") - private GenericArrayData fillArray( - Collection values, Function> makeSetter) { - Object[] array = new Object[values.size()]; - BiConsumer setter = makeSetter.apply(array); - - int index = 0; - for (Object value : values) { - if (value == null) { - array[index] = null; - } else { - setter.accept(index, (T) value); - } - - index += 1; - } - - return new GenericArrayData(array); - } -} diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java deleted file mode 100644 index 721603a9009b..000000000000 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Map; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.ReplacePartitions; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SerializableTable; -import org.apache.iceberg.SnapshotSummary; -import org.apache.iceberg.SnapshotUpdate; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.exceptions.CommitStateUnknownException; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.io.UnpartitionedWriter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.spark.SparkWriteConf; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.Tasks; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; -import org.apache.spark.sql.sources.v2.writer.DataWriter; -import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; -import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; -import org.apache.spark.sql.types.StructType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// TODO: parameterize DataSourceWriter with subclass of WriterCommitMessage -class Writer implements DataSourceWriter { - private static final Logger LOG = LoggerFactory.getLogger(Writer.class); - - private final JavaSparkContext sparkContext; - private final Table table; - private final FileFormat format; - private final boolean replacePartitions; - private final String applicationId; - private final String wapId; - private final long targetFileSize; - private final Schema writeSchema; - private final StructType dsSchema; - private final Map extraSnapshotMetadata; - private final boolean partitionedFanoutEnabled; - - private boolean cleanupOnAbort = true; - - Writer( - SparkSession spark, - Table table, - SparkWriteConf writeConf, - boolean replacePartitions, - String applicationId, - Schema writeSchema, - StructType dsSchema) { - this(spark, table, writeConf, replacePartitions, applicationId, null, writeSchema, dsSchema); - } - - Writer( - SparkSession spark, - Table table, - SparkWriteConf writeConf, - boolean replacePartitions, - String applicationId, - String wapId, - Schema writeSchema, - StructType dsSchema) { - this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - this.table = table; - this.format = writeConf.dataFileFormat(); - 
this.replacePartitions = replacePartitions; - this.applicationId = applicationId; - this.wapId = wapId; - this.targetFileSize = writeConf.targetDataFileSize(); - this.writeSchema = writeSchema; - this.dsSchema = dsSchema; - this.extraSnapshotMetadata = writeConf.extraSnapshotMetadata(); - this.partitionedFanoutEnabled = writeConf.fanoutWriterEnabled(); - } - - private boolean isWapTable() { - return Boolean.parseBoolean( - table - .properties() - .getOrDefault( - TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, - TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); - } - - @Override - public DataWriterFactory createWriterFactory() { - // broadcast the table metadata as the writer factory will be sent to executors - Broadcast
tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table)); - return new WriterFactory( - tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); - } - - @Override - public void commit(WriterCommitMessage[] messages) { - if (replacePartitions) { - replacePartitions(messages); - } else { - append(messages); - } - } - - protected void commitOperation(SnapshotUpdate<?> operation, int numFiles, String description) { - LOG.info("Committing {} with {} files to table {}", description, numFiles, table); - if (applicationId != null) { - operation.set("spark.app.id", applicationId); - } - - if (!extraSnapshotMetadata.isEmpty()) { - extraSnapshotMetadata.forEach(operation::set); - } - - if (isWapTable() && wapId != null) { - // write-audit-publish is enabled for this table and job - // stage the changes without changing the current snapshot - operation.set(SnapshotSummary.STAGED_WAP_ID_PROP, wapId); - operation.stageOnly(); - } - - try { - long start = System.currentTimeMillis(); - operation.commit(); // abort is automatically called if this fails - long duration = System.currentTimeMillis() - start; - LOG.info("Committed in {} ms", duration); - } catch (CommitStateUnknownException commitStateUnknownException) { - cleanupOnAbort = false; - throw commitStateUnknownException; - } - } - - private void append(WriterCommitMessage[] messages) { - AppendFiles append = table.newAppend(); - - int numFiles = 0; - for (DataFile file : files(messages)) { - numFiles += 1; - append.appendFile(file); - } - - commitOperation(append, numFiles, "append"); - } - - private void replacePartitions(WriterCommitMessage[] messages) { - Iterable<DataFile> files = files(messages); - - if (!files.iterator().hasNext()) { - LOG.info("Dynamic overwrite is empty, skipping commit"); - return; - } - - ReplacePartitions dynamicOverwrite = table.newReplacePartitions(); - - int numFiles = 0; - for (DataFile file : files) { - numFiles += 1; - dynamicOverwrite.addFile(file); - } - - commitOperation(dynamicOverwrite, numFiles, "dynamic partition overwrite"); - } - - @Override - public void abort(WriterCommitMessage[] messages) { - if (cleanupOnAbort) { - Map<String, String> props = table.properties(); - Tasks.foreach(files(messages)) - .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT)) - .exponentialBackoff( - PropertyUtil.propertyAsInt( - props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt( - props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt( - props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), - 2.0 /* exponential */) - .throwFailureWhenFinished() - .run( - file -> { - table.io().deleteFile(file.path().toString()); - }); - } else { - LOG.warn( - "Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); - } - } - - protected Table table() { - return table; - } - - protected Iterable<DataFile> files(WriterCommitMessage[] messages) { - if (messages.length > 0) { - return Iterables.concat( - Iterables.transform( - Arrays.asList(messages), - message -> - message != null - ? 
ImmutableList.copyOf(((TaskCommit) message).files()) - : ImmutableList.of())); - } - return ImmutableList.of(); - } - - @Override - public String toString() { - return String.format("IcebergWrite(table=%s, format=%s)", table, format); - } - - private static class TaskCommit implements WriterCommitMessage { - private final DataFile[] taskFiles; - - TaskCommit(DataFile[] files) { - this.taskFiles = files; - } - - DataFile[] files() { - return this.taskFiles; - } - } - - static class WriterFactory implements DataWriterFactory { - private final Broadcast
tableBroadcast; - private final FileFormat format; - private final long targetFileSize; - private final Schema writeSchema; - private final StructType dsSchema; - private final boolean partitionedFanoutEnabled; - - WriterFactory( - Broadcast
tableBroadcast, - FileFormat format, - long targetFileSize, - Schema writeSchema, - StructType dsSchema, - boolean partitionedFanoutEnabled) { - this.tableBroadcast = tableBroadcast; - this.format = format; - this.targetFileSize = targetFileSize; - this.writeSchema = writeSchema; - this.dsSchema = dsSchema; - this.partitionedFanoutEnabled = partitionedFanoutEnabled; - } - - @Override - public DataWriter createDataWriter(int partitionId, long taskId, long epochId) { - Table table = tableBroadcast.value(); - - OutputFileFactory fileFactory = - OutputFileFactory.builderFor(table, partitionId, taskId).format(format).build(); - SparkAppenderFactory appenderFactory = - SparkAppenderFactory.builderFor(table, writeSchema, dsSchema).build(); - - PartitionSpec spec = table.spec(); - FileIO io = table.io(); - - if (spec.isUnpartitioned()) { - return new Unpartitioned24Writer( - spec, format, appenderFactory, fileFactory, io, targetFileSize); - } else if (partitionedFanoutEnabled) { - return new PartitionedFanout24Writer( - spec, format, appenderFactory, fileFactory, io, targetFileSize, writeSchema, dsSchema); - } else { - return new Partitioned24Writer( - spec, format, appenderFactory, fileFactory, io, targetFileSize, writeSchema, dsSchema); - } - } - } - - private static class Unpartitioned24Writer extends UnpartitionedWriter - implements DataWriter { - Unpartitioned24Writer( - PartitionSpec spec, - FileFormat format, - SparkAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO fileIo, - long targetFileSize) { - super(spec, format, appenderFactory, fileFactory, fileIo, targetFileSize); - } - - @Override - public WriterCommitMessage commit() throws IOException { - close(); - - return new TaskCommit(dataFiles()); - } - } - - private static class Partitioned24Writer extends SparkPartitionedWriter - implements DataWriter { - - Partitioned24Writer( - PartitionSpec spec, - FileFormat format, - SparkAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO fileIo, - long targetFileSize, - Schema schema, - StructType sparkSchema) { - super( - spec, format, appenderFactory, fileFactory, fileIo, targetFileSize, schema, sparkSchema); - } - - @Override - public WriterCommitMessage commit() throws IOException { - close(); - - return new TaskCommit(dataFiles()); - } - } - - private static class PartitionedFanout24Writer extends SparkPartitionedFanoutWriter - implements DataWriter { - - PartitionedFanout24Writer( - PartitionSpec spec, - FileFormat format, - SparkAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO fileIo, - long targetFileSize, - Schema schema, - StructType sparkSchema) { - super( - spec, format, appenderFactory, fileFactory, fileIo, targetFileSize, schema, sparkSchema); - } - - @Override - public WriterCommitMessage commit() throws IOException { - close(); - - return new TaskCommit(dataFiles()); - } - } -} diff --git a/spark/v2.4/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/spark/v2.4/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister deleted file mode 100644 index 01a6c4e0670d..000000000000 --- a/spark/v2.4/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ /dev/null @@ -1,20 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -org.apache.iceberg.spark.source.IcebergSource diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java deleted file mode 100644 index 6d88aaa11813..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import org.apache.spark.SparkConf; -import org.apache.spark.serializer.KryoSerializer; - -public class KryoHelpers { - - private KryoHelpers() {} - - @SuppressWarnings("unchecked") - public static T roundTripSerialize(T obj) throws IOException { - Kryo kryo = new KryoSerializer(new SparkConf()).newKryo(); - - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - - try (Output out = new Output(new ObjectOutputStream(bytes))) { - kryo.writeClassAndObject(out, obj); - } - - try (Input in = - new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { - return (T) kryo.readClassAndObject(in); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java deleted file mode 100644 index 235cf69ef449..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg; - -import java.util.Comparator; -import java.util.List; -import java.util.stream.Collectors; -import org.junit.Assert; - -public final class TaskCheckHelper { - private TaskCheckHelper() {} - - public static void assertEquals(BaseCombinedScanTask expected, BaseCombinedScanTask actual) { - List<FileScanTask> expectedTasks = getFileScanTasksInFilePathOrder(expected); - List<FileScanTask> actualTasks = getFileScanTasksInFilePathOrder(actual); - - Assert.assertEquals( - "The number of file scan tasks should match", expectedTasks.size(), actualTasks.size()); - - for (int i = 0; i < expectedTasks.size(); i++) { - FileScanTask expectedTask = expectedTasks.get(i); - FileScanTask actualTask = actualTasks.get(i); - assertEquals(expectedTask, actualTask); - } - } - - public static void assertEquals(FileScanTask expected, FileScanTask actual) { - assertEquals(expected.file(), actual.file()); - - // PartitionSpec implements its own equals method - Assert.assertEquals("PartitionSpec doesn't match", expected.spec(), actual.spec()); - - Assert.assertEquals("starting position doesn't match", expected.start(), actual.start()); - - Assert.assertEquals( - "the number of bytes to scan doesn't match", expected.length(), actual.length()); - - // simplify comparison on residual expression via comparing toString - Assert.assertEquals( - "Residual expression doesn't match", - expected.residual().toString(), - actual.residual().toString()); - } - - public static void assertEquals(DataFile expected, DataFile actual) { - Assert.assertEquals("Should match the serialized record path", expected.path(), actual.path()); - Assert.assertEquals( - "Should match the serialized record format", expected.format(), actual.format()); - Assert.assertEquals( - "Should match the serialized record partition", - expected.partition().get(0, Object.class), - actual.partition().get(0, Object.class)); - Assert.assertEquals( - "Should match the serialized record count", expected.recordCount(), actual.recordCount()); - Assert.assertEquals( - "Should match the serialized record size", - expected.fileSizeInBytes(), - actual.fileSizeInBytes()); - Assert.assertEquals( - "Should match the serialized record value counts", - expected.valueCounts(), - actual.valueCounts()); - Assert.assertEquals( - "Should match the serialized record null value counts", - expected.nullValueCounts(), - actual.nullValueCounts()); - Assert.assertEquals( - "Should match the serialized record lower bounds", - expected.lowerBounds(), - actual.lowerBounds()); - Assert.assertEquals( - "Should match the serialized record upper bounds", - expected.upperBounds(), - actual.upperBounds()); - Assert.assertEquals( - "Should match the serialized record key metadata", - expected.keyMetadata(), - actual.keyMetadata()); - Assert.assertEquals( - "Should match the serialized record offsets", - expected.splitOffsets(), - actual.splitOffsets()); - Assert.assertEquals( - "Should match the serialized record key metadata", expected.keyMetadata(), actual.keyMetadata()); - } - - private static List<FileScanTask> getFileScanTasksInFilePathOrder(BaseCombinedScanTask task) { - return 
task.files().stream() - // use file path + start position to differentiate the tasks - .sorted(Comparator.comparing(o -> o.file().path().toString() + "##" + o.start())) - .collect(Collectors.toList()); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java deleted file mode 100644 index 33b5316b72b7..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg; - -import static org.apache.iceberg.TaskCheckHelper.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Map; -import java.util.UUID; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.SparkParquetWriters; -import org.apache.iceberg.types.Types; -import org.apache.spark.SparkConf; -import org.apache.spark.serializer.KryoSerializer; -import org.apache.spark.sql.catalyst.InternalRow; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestDataFileSerialization { - - private static final Schema DATE_SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec PARTITION_SPEC = - PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); - - private static final Map VALUE_COUNTS = Maps.newHashMap(); - private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); - private static final Map NAN_VALUE_COUNTS = Maps.newHashMap(); - private static final Map LOWER_BOUNDS = Maps.newHashMap(); - private static final Map UPPER_BOUNDS = 
Maps.newHashMap(); - - static { - VALUE_COUNTS.put(1, 5L); - VALUE_COUNTS.put(2, 3L); - VALUE_COUNTS.put(4, 2L); - NULL_VALUE_COUNTS.put(1, 0L); - NULL_VALUE_COUNTS.put(2, 2L); - NAN_VALUE_COUNTS.put(4, 1L); - LOWER_BOUNDS.put(1, longToBuffer(0L)); - UPPER_BOUNDS.put(1, longToBuffer(4L)); - } - - private static final DataFile DATA_FILE = - DataFiles.builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics( - new Metrics( - 5L, - null, - VALUE_COUNTS, - NULL_VALUE_COUNTS, - NAN_VALUE_COUNTS, - LOWER_BOUNDS, - UPPER_BOUNDS)) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Test - public void testDataFileKryoSerialization() throws Exception { - File data = temp.newFile(); - Assert.assertTrue(data.delete()); - Kryo kryo = new KryoSerializer(new SparkConf()).newKryo(); - - try (Output out = new Output(new FileOutputStream(data))) { - kryo.writeClassAndObject(out, DATA_FILE); - kryo.writeClassAndObject(out, DATA_FILE.copy()); - } - - try (Input in = new Input(new FileInputStream(data))) { - for (int i = 0; i < 2; i += 1) { - Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); - assertEquals(DATA_FILE, (DataFile) obj); - } - } - } - - @Test - public void testDataFileJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(DATA_FILE); - out.writeObject(DATA_FILE.copy()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 2; i += 1) { - Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); - assertEquals(DATA_FILE, (DataFile) obj); - } - } - } - - @Test - public void testParquetWriterSplitOffsets() throws IOException { - Iterable records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L); - File parquetFile = - new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); - FileAppender writer = - Parquet.write(Files.localOutput(parquetFile)) - .schema(DATE_SCHEMA) - .createWriterFunc( - msgType -> - SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) - .build(); - try { - writer.addAll(records); - } finally { - writer.close(); - } - - Kryo kryo = new KryoSerializer(new SparkConf()).newKryo(); - File dataFile = temp.newFile(); - try (Output out = new Output(new FileOutputStream(dataFile))) { - kryo.writeClassAndObject(out, writer.splitOffsets()); - } - try (Input in = new Input(new FileInputStream(dataFile))) { - kryo.readClassAndObject(in); - } - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java deleted file mode 100644 index b44e6cbb8d4c..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestFileIOSerialization { - - private static final Configuration CONF = new Configuration(); - private static final HadoopTables TABLES = new HadoopTables(CONF); - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("date").build(); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - - static { - CONF.set("k1", "v1"); - CONF.set("k2", "v2"); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - private Table table; - - @Before - public void initTable() throws IOException { - Map props = ImmutableMap.of("k1", "v1", "k2", "v2"); - - File tableLocation = temp.newFolder(); - Assert.assertTrue(tableLocation.delete()); - - this.table = TABLES.create(SCHEMA, SPEC, SORT_ORDER, props, tableLocation.toString()); - } - - @Test - public void testHadoopFileIOKryoSerialization() throws IOException { - FileIO io = table.io(); - Configuration expectedConf = ((HadoopFileIO) io).conf(); - - Table serializableTable = SerializableTable.copyOf(table); - FileIO deserializedIO = KryoHelpers.roundTripSerialize(serializableTable.io()); - Configuration actualConf = ((HadoopFileIO) deserializedIO).conf(); - - Assert.assertEquals("Conf pairs must match", toMap(expectedConf), toMap(actualConf)); - Assert.assertEquals("Conf values must be present", "v1", actualConf.get("k1")); - Assert.assertEquals("Conf values must be present", "v2", actualConf.get("k2")); - } - - @Test - public void testHadoopFileIOJavaSerialization() throws IOException, ClassNotFoundException { - FileIO io = table.io(); - Configuration expectedConf = ((HadoopFileIO) io).conf(); - - Table serializableTable = SerializableTable.copyOf(table); - FileIO deserializedIO = TestHelpers.roundTripSerialize(serializableTable.io()); - Configuration actualConf = ((HadoopFileIO) 
deserializedIO).conf(); - - Assert.assertEquals("Conf pairs must match", toMap(expectedConf), toMap(actualConf)); - Assert.assertEquals("Conf values must be present", "v1", actualConf.get("k1")); - Assert.assertEquals("Conf values must be present", "v2", actualConf.get("k2")); - } - - private Map toMap(Configuration conf) { - Map map = Maps.newHashMapWithExpectedSize(conf.size()); - conf.forEach(entry -> map.put(entry.getKey(), entry.getValue())); - return map; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java deleted file mode 100644 index 92a646d3861b..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.ManifestFile.PartitionFieldSummary; -import org.apache.iceberg.hadoop.HadoopFileIO; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.apache.spark.SparkConf; -import org.apache.spark.serializer.KryoSerializer; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestManifestFileSerialization { - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("double").build(); - - private static final DataFile FILE_A = - DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics( - new Metrics( - 5L, - null, // no column sizes - 
ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = - DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics( - new Metrics( - 1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); - - private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Test - public void testManifestFileKryoSerialization() throws IOException { - File data = temp.newFile(); - Assert.assertTrue(data.delete()); - - Kryo kryo = new KryoSerializer(new SparkConf()).newKryo(); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - try (Output out = new Output(new FileOutputStream(data))) { - kryo.writeClassAndObject(out, manifest); - kryo.writeClassAndObject(out, manifest.copy()); - kryo.writeClassAndObject(out, GenericManifestFile.copyOf(manifest).build()); - } - - try (Input in = new Input(new FileInputStream(data))) { - for (int i = 0; i < 3; i += 1) { - Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); - checkManifestFile(manifest, (ManifestFile) obj); - } - } - } - - @Test - public void testManifestFileJavaSerialization() throws Exception { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - - ManifestFile manifest = writeManifest(FILE_A, FILE_B); - - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(manifest); - out.writeObject(manifest.copy()); - out.writeObject(GenericManifestFile.copyOf(manifest).build()); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - for (int i = 0; i < 3; i += 1) { - Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); - checkManifestFile(manifest, (ManifestFile) obj); - } - } - } - - private void checkManifestFile(ManifestFile expected, ManifestFile actual) { - Assert.assertEquals("Path must match", expected.path(), actual.path()); - Assert.assertEquals("Length must match", expected.length(), actual.length()); - Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); - Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals( - "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals( - "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals( - "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals( - "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals( - "Existing files count must match", - expected.existingFilesCount(), - actual.existingFilesCount()); - Assert.assertEquals( - "Existing rows count must match", 
expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals( - "Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals( - "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals( - "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); - - PartitionFieldSummary expectedPartition = expected.partitions().get(0); - PartitionFieldSummary actualPartition = actual.partitions().get(0); - - Assert.assertEquals( - "Null flag in partition must match", - expectedPartition.containsNull(), - actualPartition.containsNull()); - Assert.assertEquals( - "NaN flag in partition must match", - expectedPartition.containsNaN(), - actualPartition.containsNaN()); - Assert.assertEquals( - "Lower bounds in partition must match", - expectedPartition.lowerBound(), - actualPartition.lowerBound()); - Assert.assertEquals( - "Upper bounds in partition must match", - expectedPartition.upperBound(), - actualPartition.upperBound()); - } - - private ManifestFile writeManifest(DataFile... files) throws IOException { - File manifestFile = temp.newFile("input.m0.avro"); - Assert.assertTrue(manifestFile.delete()); - OutputFile outputFile = FILE_IO.newOutputFile(manifestFile.getCanonicalPath()); - - ManifestWriter writer = ManifestFiles.write(SPEC, outputFile); - try { - for (DataFile file : files) { - writer.add(file); - } - } finally { - writer.close(); - } - - return writer.toManifestFile(); - } - - private static ByteBuffer longToBuffer(long value) { - return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java deleted file mode 100644 index 4dd34f7a7611..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.spark.source.ThreeColumnRecord; -import org.apache.iceberg.types.Types; -import org.apache.spark.SparkConf; -import org.apache.spark.serializer.KryoSerializer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestScanTaskSerialization extends SparkTestBase { - - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get())); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private String tableLocation = null; - - @Before - public void setupTableLocation() throws Exception { - File tableDir = temp.newFolder(); - this.tableLocation = tableDir.toURI().toString(); - } - - @Test - public void testBaseCombinedScanTaskKryoSerialization() throws Exception { - BaseCombinedScanTask scanTask = prepareBaseCombinedScanTaskForSerDeTest(); - - File data = temp.newFile(); - Assert.assertTrue(data.delete()); - Kryo kryo = new KryoSerializer(new SparkConf()).newKryo(); - - try (Output out = new Output(new FileOutputStream(data))) { - kryo.writeClassAndObject(out, scanTask); - } - - try (Input in = new Input(new FileInputStream(data))) { - Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj) - .as("Should be a BaseCombinedScanTask") - .isInstanceOf(BaseCombinedScanTask.class); - TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); - } - } - - @Test - public void testBaseCombinedScanTaskJavaSerialization() throws Exception { - BaseCombinedScanTask scanTask = prepareBaseCombinedScanTaskForSerDeTest(); - - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - try (ObjectOutputStream out = new ObjectOutputStream(bytes)) { - out.writeObject(scanTask); - } - - try (ObjectInputStream in = - new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { - Object obj = in.readObject(); - Assertions.assertThat(obj) - .as("Should be a BaseCombinedScanTask") - .isInstanceOf(BaseCombinedScanTask.class); - TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); - } - } - - private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new 
ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); - writeRecords(records2); - - table.refresh(); - - CloseableIterable tasks = table.newScan().planFiles(); - return new BaseCombinedScanTask(Lists.newArrayList(tasks)); - } - - private void writeRecords(List records) { - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - writeDF(df); - } - - private void writeDF(Dataset df) { - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java deleted file mode 100644 index a4b86752cf3b..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Map; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestTableSerialization { - - private static final HadoopTables TABLES = new HadoopTables(); - - private static final Schema SCHEMA = - new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("date").build(); - - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - private Table table; - - @Before - public void initTable() throws IOException { - Map props = ImmutableMap.of("k1", "v1", "k2", "v2"); - - File tableLocation = temp.newFolder(); - Assert.assertTrue(tableLocation.delete()); - - this.table = TABLES.create(SCHEMA, SPEC, SORT_ORDER, props, tableLocation.toString()); - } - - @Test - public void testSerializableTableKryoSerialization() throws IOException { - Table serializableTable = SerializableTable.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata( - table, KryoHelpers.roundTripSerialize(serializableTable)); - } - - @Test - public void testSerializableMetadataTableKryoSerialization() throws IOException { - for (MetadataTableType type : MetadataTableType.values()) { - TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = - MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); - Table serializableMetadataTable = SerializableTable.copyOf(metadataTable); - - TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, KryoHelpers.roundTripSerialize(serializableMetadataTable)); - } - } - - @Test - public void testSerializableTransactionTableKryoSerialization() throws IOException { - Transaction txn = table.newTransaction(); - - txn.updateProperties().set("k1", "v1").commit(); - - Table txnTable = txn.table(); - Table serializableTxnTable = SerializableTable.copyOf(txnTable); - - TestHelpers.assertSerializedMetadata( - txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/ValidationHelpers.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/ValidationHelpers.java deleted file mode 100644 index 70ab04f0a080..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/ValidationHelpers.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg; - -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.assertj.core.api.Assertions; - -public class ValidationHelpers { - - private ValidationHelpers() {} - - public static List dataSeqs(Long... seqs) { - return Arrays.asList(seqs); - } - - public static List fileSeqs(Long... seqs) { - return Arrays.asList(seqs); - } - - public static List snapshotIds(Long... ids) { - return Arrays.asList(ids); - } - - public static List files(ContentFile... files) { - return Arrays.stream(files).map(file -> file.path().toString()).collect(Collectors.toList()); - } - - public static void validateDataManifest( - Table table, - ManifestFile manifest, - List dataSeqs, - List fileSeqs, - List snapshotIds, - List files) { - - List actualDataSeqs = Lists.newArrayList(); - List actualFileSeqs = Lists.newArrayList(); - List actualSnapshotIds = Lists.newArrayList(); - List actualFiles = Lists.newArrayList(); - - for (ManifestEntry entry : ManifestFiles.read(manifest, table.io()).entries()) { - actualDataSeqs.add(entry.dataSequenceNumber()); - actualFileSeqs.add(entry.fileSequenceNumber()); - actualSnapshotIds.add(entry.snapshotId()); - actualFiles.add(entry.file().path().toString()); - } - - assertSameElements("data seqs", actualDataSeqs, dataSeqs); - assertSameElements("file seqs", actualFileSeqs, fileSeqs); - assertSameElements("snapshot IDs", actualSnapshotIds, snapshotIds); - assertSameElements("files", actualFiles, files); - } - - private static void assertSameElements(String context, List actual, List expected) { - String errorMessage = String.format("%s must match", context); - Assertions.assertThat(actual).as(errorMessage).hasSameElementsAs(expected); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java deleted file mode 100644 index 226cc897856f..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.actions; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.stream.IntStream; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.spark.source.ThreeColumnRecord; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.AnalysisException; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestRewriteDataFilesAction extends SparkTestBase { - - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get())); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private String tableLocation = null; - - @Before - public void setupTableLocation() throws Exception { - File tableDir = temp.newFolder(); - this.tableLocation = tableDir.toURI().toString(); - } - - @Test - public void testRewriteDataFilesEmptyTable() { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - Assert.assertNull("Table must be empty", table.currentSnapshot()); - - Actions actions = Actions.forTable(table); - - actions.rewriteDataFiles().execute(); - - Assert.assertNull("Table must stay empty", table.currentSnapshot()); - } - - @Test - public void testRewriteDataFilesUnpartitionedTable() { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); - writeRecords(records2); - - table.refresh(); - - CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size()); - - Actions actions = Actions.forTable(table); - - RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute(); - Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); - - table.refresh(); - - CloseableIterable tasks1 = 
table.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - Assert.assertEquals("Should have 1 data files before rewrite", 1, dataFiles1.size()); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testRewriteDataFilesPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); - Map options = Maps.newHashMap(); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), - new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")); - writeRecords(records2); - - List records3 = - Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), - new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")); - writeRecords(records3); - - List records4 = - Lists.newArrayList( - new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")); - writeRecords(records4); - - table.refresh(); - - CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size()); - - Actions actions = Actions.forTable(table); - - RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute(); - Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFiles().size()); - - table.refresh(); - - CloseableIterable tasks1 = table.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles1.size()); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - expectedRecords.addAll(records3); - expectedRecords.addAll(records4); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testRewriteDataFilesWithFilter() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); - Map options = Maps.newHashMap(); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), - new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")); - writeRecords(records2); - - List records3 = - Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), - 
new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")); - writeRecords(records3); - - List records4 = - Lists.newArrayList( - new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")); - writeRecords(records4); - - table.refresh(); - - CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size()); - - Actions actions = Actions.forTable(table); - - RewriteDataFilesActionResult result = - actions - .rewriteDataFiles() - .filter(Expressions.equal("c1", 1)) - .filter(Expressions.startsWith("c2", "AA")) - .execute(); - Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); - - table.refresh(); - - CloseableIterable tasks1 = table.newScan().planFiles(); - List dataFiles1 = - Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); - Assert.assertEquals("Should have 7 data files before rewrite", 7, dataFiles1.size()); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - expectedRecords.addAll(records3); - expectedRecords.addAll(records4); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testRewriteLargeTableHasResiduals() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build(); - Map options = Maps.newHashMap(); - options.put(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100"); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - // all records belong to the same partition - List records = Lists.newArrayList(); - for (int i = 0; i < 100; i++) { - records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i % 4))); - } - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - writeDF(df); - - table.refresh(); - - CloseableIterable tasks = - table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles(); - for (FileScanTask task : tasks) { - Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); - } - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); - - Actions actions = Actions.forTable(table); - - RewriteDataFilesActionResult result = - actions.rewriteDataFiles().filter(Expressions.equal("c3", "0")).execute(); - Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); - - table.refresh(); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", records, actualRecords); - } - - @Test - public void testRewriteDataFilesForLargeFile() throws AnalysisException { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - Table table = TABLES.create(SCHEMA, spec, options, 
tableLocation); - Assert.assertNull("Table must be empty", table.currentSnapshot()); - - List records1 = Lists.newArrayList(); - - IntStream.range(0, 2000) - .forEach(i -> records1.add(new ThreeColumnRecord(i, "foo" + i, "bar" + i))); - Dataset df = spark.createDataFrame(records1, ThreeColumnRecord.class).repartition(1); - writeDF(df); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "DDDDDDDDDD", "DDDD")); - writeRecords(records2); - - table.refresh(); - - CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - DataFile maxSizeFile = - Collections.max(dataFiles, Comparator.comparingLong(DataFile::fileSizeInBytes)); - Assert.assertEquals("Should have 3 files before rewrite", 3, dataFiles.size()); - - spark.read().format("iceberg").load(tableLocation).createTempView("origin"); - long originalNumRecords = spark.read().format("iceberg").load(tableLocation).count(); - List originalRecords = sql("SELECT * from origin sort by c2"); - - Actions actions = Actions.forTable(table); - - long targetSizeInBytes = maxSizeFile.fileSizeInBytes() - 10; - RewriteDataFilesActionResult result = - actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); - - Assert.assertEquals("Action should delete 3 data files", 3, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFiles().size()); - - spark.read().format("iceberg").load(tableLocation).createTempView("postRewrite"); - long postRewriteNumRecords = spark.read().format("iceberg").load(tableLocation).count(); - List rewrittenRecords = sql("SELECT * from postRewrite sort by c2"); - - Assert.assertEquals(originalNumRecords, postRewriteNumRecords); - assertEquals("Rows should be unchanged", originalRecords, rewrittenRecords); - } - - private void writeRecords(List records) { - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - writeDF(df); - } - - private void writeDF(Dataset df) { - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - } - - @Test - public void testRewriteToOutputPartitionSpec() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - Map options = Maps.newHashMap(); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - table.updateSpec().addField(Expressions.truncate("c2", 2)).commit(); - - Assert.assertEquals("Should have 2 partitions specs", 2, table.specs().size()); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), - new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")); - writeRecords(records2); - - List records3 = - Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), - new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")); - writeRecords(records3); - - List records4 = - Lists.newArrayList( - new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")); - writeRecords(records4); - - table.refresh(); - - CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = - Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - Assert.assertEquals("Should have 8 data files before rewrite", 8, 
dataFiles.size()); - - Dataset beforeResultDF = spark.read().format("iceberg").load(tableLocation); - List beforeActualFilteredRecords = - beforeResultDF - .sort("c1", "c2", "c3") - .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); - Assert.assertEquals("Rows must match", records2, beforeActualFilteredRecords); - - Actions actions = Actions.forTable(table); - RewriteDataFilesActionResult result = actions.rewriteDataFiles().outputSpecId(0).execute(); - Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size()); - Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size()); - - Assert.assertTrue(result.deletedDataFiles().stream().allMatch(df -> df.specId() == 1)); - Assert.assertTrue(result.addedDataFiles().stream().allMatch(df -> df.specId() == 0)); - - table.refresh(); - - CloseableIterable tasks2 = table.newScan().planFiles(); - List dataFiles2 = - Lists.newArrayList(CloseableIterable.transform(tasks2, FileScanTask::file)); - Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles2.size()); - - // Should still have all the same data - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - expectedRecords.addAll(records3); - expectedRecords.addAll(records4); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - - List actualFilteredRecords = - resultDF - .sort("c1", "c2", "c3") - .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); - Assert.assertEquals("Rows must match", records2, actualFilteredRecords); - - List records5 = - Lists.newArrayList( - new ThreeColumnRecord(3, "CCCCCCCCCC", "FFFF"), - new ThreeColumnRecord(3, "CCCCCCCCCC", "HHHH")); - writeRecords(records5); - expectedRecords.addAll(records5); - actualRecords = - resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java deleted file mode 100644 index 715b953b443a..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.examples; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import org.apache.commons.io.FileUtils; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** This class tests how Iceberg handles concurrency when reading and writing at the same time */ -public class ConcurrencyTest { - - private static final Logger log = LoggerFactory.getLogger(ConcurrencyTest.class); - - private Schema schema = - new Schema( - optional(1, "key", Types.LongType.get()), optional(2, "value", Types.StringType.get())); - private SparkSession spark; - private File tableLocation; - private Table table; - - private List data = Lists.newArrayList(); - - @Before - public void before() throws IOException { - tableLocation = Files.createTempDirectory("temp").toFile(); - - spark = SparkSession.builder().master("local[2]").getOrCreate(); - spark.sparkContext().setLogLevel("WARN"); - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - table = tables.create(schema, tableLocation.toString()); - - for (int i = 0; i < 1000000; i++) { - data.add(new SimpleRecord(1, "bdp")); - } - - log.info("End of setup phase"); - } - - /** - * The test creates 500 read tasks and one really long write (writing 1 mil rows) and uses - * threading to call the tasks concurrently. - */ - @Test - public void writingAndReadingConcurrently() throws InterruptedException { - ExecutorService threadPool = Executors.newFixedThreadPool(5); - List> tasks = Lists.newArrayList(); - - Callable write = () -> writeToTable(data); - tasks.add(write); - - for (int i = 0; i < 500; i++) { - Callable getReads = () -> readTable(); - tasks.add(getReads); - } - - threadPool.invokeAll(tasks); - threadPool.shutdown(); - - table.refresh(); - readTable(); - } - - private Void readTable() { - Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); - - log.info("" + results.count()); - return null; - } - - private Void writeToTable(List writeData) { - log.info("WRITING!"); - Dataset df = spark.createDataFrame(writeData, SimpleRecord.class); - df.select("key", "value") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation.toString()); - return null; - } - - @After - public void after() throws IOException { - spark.stop(); - FileUtils.deleteDirectory(tableLocation); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/README.md b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/README.md deleted file mode 100644 index eca410dfeabf..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/README.md +++ /dev/null @@ -1,195 +0,0 @@ -# Iceberg Java API Examples (with Spark) - -## About -Welcome! 
:smile: - -If you've stumbled across this module, hopefully you're looking for some guidance on how to get started with the [Apache Iceberg](https://iceberg.apache.org/) table format. This set of classes collects code examples of how to use the Iceberg Java API with Spark, along with some extra detail here in the README. - -The examples are structured as JUnit tests that you can download and run locally if you want to mess around with Iceberg yourself. - -## Using Iceberg -### Maven -If you'd like to try out Iceberg in your own project using Spark, you can use the `iceberg-spark-runtime` dependency: -```xml -<dependency> -  <groupId>org.apache.iceberg</groupId> -  <artifactId>iceberg-spark-runtime</artifactId> -  <version>${iceberg.version}</version> -</dependency> -``` - -You'll also need `spark-sql`: -```xml -<dependency> -  <groupId>org.apache.spark</groupId> -  <artifactId>spark-sql_2.11</artifactId> -  <version>2.4.4</version> -</dependency> -``` - -### Gradle -To add a dependency on Iceberg in Gradle, add the following to `build.gradle`: -``` -dependencies { - compile 'org.apache.iceberg:iceberg-core:0.8.0-incubating' -} -``` - -## Key features investigated -The following section will break down the different areas of Iceberg explored in the examples, with links to the code and extra information that could be useful for new users. - -### Writing data to tables -There are multiple ways of creating tables with Iceberg, including using the Hive Metastore to keep track of tables ([HiveCatalog](https://iceberg.apache.org/java-api-quickstart/#using-a-hive-catalog)), or using HDFS / your local file system ([HadoopTables](https://iceberg.apache.org/java-api-quickstart/#using-hadoop-tables)) to store the tables. However, note that directory tables (such as those using `HadoopTables`) don’t support all catalog operations, such as rename, and therefore use the `Tables` interface instead of the `Catalog` interface. -Also note that `HadoopTables` _shouldn’t_ be used with file systems that do not support atomic rename, as Iceberg depends on this to synchronize concurrent commits. -To limit complexity, these examples create tables on your local file system using the `HadoopTables` class. - -To create an Iceberg `Table` you will need to use the Iceberg API to create a `Schema` and `PartitionSpec`, which you use with a Spark `DataFrameWriter`. - -Code examples can be found [here](ReadAndWriteTablesTest.java). - -#### A quick look at file structures -It could be interesting to note that when writing partitioned data, Iceberg will lay out your files in a similar manner to Hive: - -``` -├── data -│   ├── published_month=2017-09 -│   │   └── 00000-1-5cbc72f6-7c1a-45e4-bb26-bc30deaca247-00002.parquet -│   ├── published_month=2018-09 -│   │   └── 00000-1-5cbc72f6-7c1a-45e4-bb26-bc30deaca247-00001.parquet -│   ├── published_month=2018-11 -│   │   └── 00000-1-5cbc72f6-7c1a-45e4-bb26-bc30deaca247-00000.parquet -│   └── published_month=null -│   └── 00000-1-5cbc72f6-7c1a-45e4-bb26-bc30deaca247-00003.parquet -└── metadata - └── version-hint.text -``` -**WARNING** -It is not possible to simply drag and drop data files into an Iceberg table like the one shown above and expect to see your data in the table. -Each file is tracked individually and is managed by Iceberg, and so must be written into the table using the Iceberg API. - -### Reading data from tables -Reading Iceberg tables is fairly simple using the Spark `DataFrameReader`. - -Code examples can be found [here](ReadAndWriteTablesTest.java). - -### A look at the metadata -This section looks a little bit closer at the metadata produced by Iceberg tables.
Consider an example where you've written some data to a table. Your files will look something like this: - -``` -├── data -│   └── ... -└── metadata - ├── 51accd1d-39c7-4a6e-8f35-9e05f7c67864-m0.avro - ├── snap-1335014336004891572-1-51accd1d-39c7-4a6e-8f35-9e05f7c67864.avro - ├── v1.metadata.json - ├── v2.metadata.json - └── version-hint.text -``` - -The metadata for your table is kept in JSON files and each commit to a table will produce a new metadata file. For tables using a metastore for the metadata, the file used is whichever file the metastore points at. For `HadoopTables`, the file used will be the latest version available. Look [here](https://iceberg.apache.org/spec/#table-metadata) for more information on metadata. - -The metadata file will contain things like the table location, the schema and the partition spec: - -```json -{ - "format-version" : 1, - "table-uuid" : "f31aa6d7-acc3-4365-b737-4ef028a60bc1", - "location" : "/var/folders/sg/ypkyhl2s0p18qcd10ddpkn0c0000gn/T/temp5216691795982307214", - "last-updated-ms" : 1572972868185, - "last-column-id" : 2, - "schema" : { - "type" : "struct", - "fields" : [ { - ... - } ] - }, - "partition-spec" : [ { - ... - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - ... - } ], - "last-partition-id" : 1000, - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ] -} -``` - -When you then add your first chunk of data, you get a new version of the metadata (`v2.metadata.json`) that is the same as the first version except for the snapshot section at the bottom, which gets updated to: - -```json -"current-snapshot-id" : 8405273199394950821, - "snapshots" : [ { - "snapshot-id" : 8405273199394950821, - "timestamp-ms" : 1572972873293, - "summary" : { - "operation" : "append", - "spark.app.id" : "local-1572972867758", - "added-data-files" : "4", - "added-records" : "4", - "changed-partition-count" : "4", - "total-records" : "4", - "total-data-files" : "4" - }, - "manifest-list" : "/var/folders/sg/ypkyhl2s0p18qcd10ddpkn0c0000gn/T/temp5216691795982307214/metadata/snap-8405273199394950821-1-5706fc75-31e1-404e-aa23-b493387e2e32.avro" - } ], - "snapshot-log" : [ { - "timestamp-ms" : 1572972873293, - "snapshot-id" : 8405273199394950821 - } ] -``` - -Here you get information on the data you have just written to the table, such as `added-records` and `added-data-files`, as well as where the manifest list is located. - - -### Snapshot based functionality -Iceberg uses [snapshots](https://iceberg.apache.org/terms/#snapshot) as part of its implementation, and provides a lot of useful functionality from this, such as **time travel**. - -- Iceberg creates a new snapshot for all table operations that modify the table, such as appends and overwrites. -- You are able to access the whole list of snapshots generated for a table. -- Iceberg will store all snapshots generated until you delete the snapshots using the `ExpireSnapshots` API. Currently, this must be called by the user. - - **NOTE**: A VACUUM operation with Spark is in the works for a future release to make this process easier. - - You can delete all snapshots earlier than a certain timestamp. - - You can delete snapshots based on `SnapshotID` values. -- You can read data from an old snapshot using the `SnapshotID` or a timestamp value ([time travel](https://iceberg.apache.org/spark/#time-travel)). -- You can roll back your data to an earlier snapshot. - -Code examples can be found [here](SnapshotFunctionalityTest.java).
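For a concrete feel of the snapshot APIs mentioned above, here is a minimal sketch in the style of these examples. It assumes a `table` created through `HadoopTables` and a `spark` session as in the tests, plus the `snapshot-id` / `as-of-timestamp` read options described in the linked time-travel docs:

```java
// Expire every snapshot older than a given timestamp (exclusive).
table.expireSnapshots().expireOlderThan(timestampMillis).commit();

// Roll the table back to an earlier snapshot, then refresh the local handle.
table.manageSnapshots().rollbackTo(snapshotId).commit();
table.refresh();

// Time travel: read the table as of a specific snapshot ID.
Dataset<Row> oldData = spark.read()
    .format("iceberg")
    .option("snapshot-id", snapshotId)   // "as-of-timestamp" works analogously
    .load(tableLocation);
```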
- -### Table schema evolution -Iceberg provides support to handle schema evolution of your tables over time: - -1. Add a new column - 1. The new column is always added at the end of the table (**NOTE**: This is fixed in Spark 3 which has implemented AFTER and FIRST operations). - 1. You are only able to add a column at the end of the schema, not somewhere in the middle. - 1. Any rows using the earlier schema return a `null` value for this new column. You cannot use an alternative default value. - 1. This column automatically becomes an `optional` column, meaning adding data to this column isn't enforced for each future write. -1. Delete a column - 1. When you delete a column, that column will no longer be available in any of your previous snapshots. So, use this with caution :sweat_smile: -1. Update a column - 1. Certain type promotions can be made (such as `int` -> `long`). For a definitive list, see the [official documentation](https://iceberg.apache.org/spec/#schemas-and-data-types). -1. Rename a column - 1. When you rename a column, it will appear renamed in all earlier versions of snapshots. - -Code examples can be found [here](SchemaEvolutionTest.java). - -### Optimistic concurrency -[Optimistic concurrency](https://en.wikipedia.org/wiki/Optimistic_concurrency_control) is when a system assumes that multiple writers can write to the same table without interfering with each other. This is usually used in environments where there is low data contention. It means that locking of the table isn't used, allowing multiple writers to write to the table at the same time. - -However, this means you need to occasionally deal with concurrent writer conflicts. This is when multiple writers start writing to a table at the same time, but one finishes first and commits an update. Then when the second writer tries to commit it has to throw an error because the table isn't in the same state as it was when it started writing. - -Iceberg deals with this by attempting retries of the write based on the new metadata. This can happen if the files the first write changed aren't touched by the second write, then it's deemed safe to commit the second update. - -[This test](ConcurrencyTest.java) looks to experiment with how optimistic concurrency works. For more information on conflict resolution, look [here](https://iceberg.apache.org/spec/#table-metadata) and for information on write concurrency, look [here](https://iceberg.apache.org/reliability/#concurrent-write-operations). - -By default, Iceberg has set the `commit.retry.num-retries` property to **4**. You can edit this default by creating an `UpdateProperties` object and assigning a new number to that property: - -```java - table.updateProperties().set("commit.retry.num-retries", "1").commit(); -``` - -You can find more information on other table properties you can configure [here](https://iceberg.apache.org/configuration/#table-properties). diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java deleted file mode 100644 index 0b74d49f44bf..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.examples; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.List; -import org.apache.commons.io.FileUtils; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -/** This test class uses Spark to create partitioned and unpartitioned tables locally. */ -public class ReadAndWriteTablesTest { - - private SparkSession spark; - private Table table; - private HadoopTables tables; - private File pathToTable; - private Schema schema; - - @Before - public void before() throws IOException { - spark = SparkSession.builder().master("local[2]").getOrCreate(); - - pathToTable = Files.createTempDirectory("temp").toFile(); - tables = new HadoopTables(spark.sessionState().newHadoopConf()); - - schema = - new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); - } - - @Test - public void createUnpartitionedTable() { - table = tables.create(schema, pathToTable.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString()); - - table.refresh(); - } - - @Test - public void createPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(schema).identity("id").build(); - - table = tables.create(schema, spec, pathToTable.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString()); - - table.refresh(); - } - - @Test - public void writeDataFromJsonFile() { - Schema bookSchema = - new Schema( - optional(1, "title", Types.StringType.get()), - optional(2, "price", Types.LongType.get()), - optional(3, "author", Types.StringType.get()), - optional(4, "published", Types.TimestampType.withZone()), - optional(5, "genre", Types.StringType.get())); - - table = tables.create(bookSchema, pathToTable.toString()); - - Dataset df = spark.read().json("src/test/resources/data/books.json"); - - df.select( - df.col("title"), - df.col("price"), - df.col("author"), - df.col("published").cast(DataTypes.TimestampType), - 
df.col("genre")) - .write() - .format("iceberg") - .mode("append") - .save(pathToTable.toString()); - - table.refresh(); - } - - @Test - public void readFromIcebergTableWithSpark() { - table = tables.create(schema, pathToTable.toString()); - - Dataset results = spark.read().format("iceberg").load(pathToTable.toString()); - - results.createOrReplaceTempView("table"); - spark.sql("select * from table").show(); - } - - @Test - public void readFromPartitionedTableWithFilter() { - table = tables.create(schema, pathToTable.toString()); - - Dataset results = - spark.read().format("iceberg").load(pathToTable.toString()).filter("data != \"b\""); - - results.createOrReplaceTempView("table"); - spark.sql("SELECT * FROM table").show(); - } - - @After - public void after() throws IOException { - FileUtils.deleteDirectory(pathToTable); - spark.stop(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java deleted file mode 100644 index da0ad897a354..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.examples; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; -import java.util.stream.Stream; -import org.apache.commons.io.FileUtils; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.LongType$; -import org.apache.spark.sql.types.StructField; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class tests how you can evolve your table schema with Iceberg. This includes things like - * adding, deleting, renaming columns and type promotions. 
- */ -public class SchemaEvolutionTest { - - private static final Logger log = LoggerFactory.getLogger(SchemaEvolutionTest.class); - - private static SparkSession spark; - private Table table; - private File tableLocation; - private final String dataLocation = "src/test/resources/data/"; - - @BeforeClass - public static void beforeAll() { - spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @Before - public void before() throws IOException { - tableLocation = Files.createTempDirectory("temp").toFile(); - Schema schema = - new Schema( - optional(1, "title", Types.StringType.get()), - optional(2, "price", Types.IntegerType.get()), - optional(3, "author", Types.StringType.get()), - optional(4, "published", Types.TimestampType.withZone()), - optional(5, "genre", Types.StringType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema).year("published").build(); - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - table = tables.create(schema, spec, tableLocation.toString()); - - Dataset df = spark.read().json(dataLocation + "/books.json"); - - df.select( - df.col("title"), - df.col("price").cast(DataTypes.IntegerType), - df.col("author"), - df.col("published").cast(DataTypes.TimestampType), - df.col("genre")) - .write() - .format("iceberg") - .mode("append") - .save(tableLocation.toString()); - - table.refresh(); - } - - @Test - public void addColumnToSchema() { - String fieldName = "publisher"; - Schema schema = table.schema(); - Assert.assertNull(schema.findField(fieldName)); - - table.updateSchema().addColumn(fieldName, Types.StringType.get()).commit(); - Dataset df2 = spark.read().json(dataLocation + "new-books.json"); - - df2.select( - df2.col("title"), - df2.col("price").cast(DataTypes.IntegerType), - df2.col("author"), - df2.col("published").cast(DataTypes.TimestampType), - df2.col("genre"), - df2.col("publisher")) - .write() - .format("iceberg") - .mode("append") - .save(tableLocation.toString()); - } - - @Test - public void deleteColumnFromSchema() { - table.updateSchema().deleteColumn("genre").commit(); - - table.refresh(); - Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); - - results.createOrReplaceTempView("table"); - spark.sql("select * from table").show(); - Assert.assertFalse(Arrays.asList(results.schema().names()).contains("genre")); - } - - @Test - public void renameColumn() { - table.updateSchema().renameColumn("author", "writer").commit(); - - table.refresh(); - Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); - - results.createOrReplaceTempView("table"); - spark.sql("select * from table").show(); - List fields = Arrays.asList(spark.sql("select * from table").schema().names()); - Assert.assertTrue(fields.contains("writer")); - Assert.assertFalse(fields.contains("author")); - } - - @Test - public void updateColumnTypeIntToLong() { - table.updateSchema().updateColumn("price", Types.LongType.get()).commit(); - - Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); - - Stream structFieldStream = - Arrays.stream(results.schema().fields()) - .filter(field -> field.name().equalsIgnoreCase("price")); - Optional first = structFieldStream.findFirst(); - Assert.assertTrue( - "Unable to change datatype from Long to Int", - first.isPresent() && first.get().dataType() == LongType$.MODULE$); - } - - @Test - public void updateColumnTypeIntToString() { - Assertions.assertThatThrownBy( - () -> table.updateSchema().updateColumn("price", 
Types.StringType.get()).commit()) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot change column type: price: int -> string"); - } - - @Test - public void updateColumnTypeStringToInt() { - Assertions.assertThatThrownBy( - () -> table.updateSchema().updateColumn("author", Types.IntegerType.get()).commit()) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot change column type: author: string -> int"); - } - - @Test - public void floatToDouble() throws IOException { - // Set up a new table to test this conversion - Schema schema = new Schema(optional(1, "float", Types.FloatType.get())); - File location = Files.createTempDirectory("temp").toFile(); - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table floatTable = tables.create(schema, location.toString()); - - floatTable.updateSchema().updateColumn("float", Types.DoubleType.get()).commit(); - - log.info("Promote float type to double type:\n" + floatTable.schema().toString()); - } - - @Test - public void widenDecimalPrecision() throws IOException { - // Set up a new table to test this conversion - Schema schema = new Schema(optional(1, "decimal", Types.DecimalType.of(2, 2))); - File location = Files.createTempDirectory("temp").toFile(); - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table decimalTable = tables.create(schema, location.toString()); - - decimalTable.updateSchema().updateColumn("decimal", Types.DecimalType.of(4, 2)).commit(); - - log.info("Widen decimal type:\n" + decimalTable.schema().toString()); - } - - @Test - public void after() throws IOException { - FileUtils.deleteDirectory(tableLocation); - } - - @AfterClass - public static void afterAll() { - spark.stop(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java deleted file mode 100644 index 8d4e4c7e1668..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.examples; - -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -public class SimpleRecord { - private Integer id; - private String data; - - public SimpleRecord() {} - - SimpleRecord(Integer id, String data) { - this.id = id; - this.data = data; - } - - public Integer getId() { - return id; - } - - public void setId(Integer id) { - this.id = id; - } - - public String getData() { - return data; - } - - public void setData(String data) { - this.data = data; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - SimpleRecord record = (SimpleRecord) o; - return Objects.equal(id, record.id) && Objects.equal(data, record.data); - } - - @Override - public int hashCode() { - return Objects.hashCode(id, data); - } - - @Override - public String toString() { - StringBuilder buffer = new StringBuilder(); - buffer.append("{\"id\"="); - buffer.append(id); - buffer.append(",\"data\"=\""); - buffer.append(data); - buffer.append("\"}"); - return buffer.toString(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java deleted file mode 100644 index 82226a716c0a..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.examples; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.Iterator; -import java.util.List; -import org.apache.commons.collections.IteratorUtils; -import org.apache.commons.io.FileUtils; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class tests the snapshot functionality available with Iceberg. This includes things like - * time-travel, rollback and retrieving metadata. 
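 * <p>A minimal sketch of the snapshot operations exercised below, assuming an existing
 * {@code table} handle (these calls mirror the tests in this class):
 *
 * <pre>
 *   table.expireSnapshots().expireSnapshotId(oldId).commit();   // drop a single snapshot by ID
 *   for (DataFile file : table.currentSnapshot().addedDataFiles(table.io())) {
 *     log.info("added file: " + file.path() + " (" + file.recordCount() + " records)");
 *   }
 * </pre>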
- */ -public class SnapshotFunctionalityTest { - - private static final Logger log = LoggerFactory.getLogger(SnapshotFunctionalityTest.class); - - private Table table; - private File tableLocation; - private SparkSession spark = null; - - @Before - public void before() throws IOException { - Schema schema = - new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); - - spark = SparkSession.builder().master("local[2]").getOrCreate(); - - tableLocation = Files.createTempDirectory("temp").toFile(); - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - PartitionSpec spec = PartitionSpec.unpartitioned(); - table = tables.create(schema, spec, tableLocation.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - for (int i = 0; i < 5; i++) { - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation.toString()); - } - table.refresh(); - } - - @Test - public void rollbackToPreviousSnapshotAndReadData() { - long oldId = table.history().get(0).snapshotId(); - - table.manageSnapshots().rollbackTo(oldId).commit(); - table.refresh(); - - Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); - - results.createOrReplaceTempView("table"); - spark.sql("select * from table").show(); - } - - @Test - public void expireOldSnapshotWithSnapshotID() { - long oldId = table.history().get(0).snapshotId(); - - table.expireSnapshots().expireSnapshotId(oldId).commit(); - table.refresh(); - - Iterator iterator = table.snapshots().iterator(); - List snapshots = IteratorUtils.toList(iterator); - } - - /** Expires anything older than a given timestamp, NOT including that timestamp. */ - @Test - public void retireAllSnapshotsOlderThanTimestamp() { - long secondLatestTimestamp = table.history().get(2).timestampMillis(); - Iterator beforeIterator = table.snapshots().iterator(); - List beforeSnapshots = IteratorUtils.toList(beforeIterator); - - // Delete the 2 oldest snapshots - table.expireSnapshots().expireOlderThan(secondLatestTimestamp).commit(); - table.refresh(); - - Iterator afterIterator = table.snapshots().iterator(); - List afterSnapshots = IteratorUtils.toList(afterIterator); - } - - @Test - public void getInfoAboutFilesAddedFromSnapshot() { - Snapshot snapshot = table.currentSnapshot(); - Iterable addedFiles = snapshot.addedDataFiles(table.io()); - - for (DataFile dataFile : addedFiles) { - log.info("File path: " + dataFile.path()); - log.info("File format: " + dataFile.format()); - log.info("File size in bytes: " + dataFile.fileSizeInBytes()); - log.info("Record count: " + dataFile.recordCount()); - } - } - - @After - public void after() throws IOException { - FileUtils.deleteDirectory(tableLocation); - spark.stop(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java deleted file mode 100644 index f2d3bd0b0764..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.internal.SQLConf; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; - -public abstract class SparkTestBase { - - protected static final Object ANY = new Object(); - - protected static TestHiveMetastore metastore = null; - protected static HiveConf hiveConf = null; - protected static SparkSession spark = null; - protected static HiveCatalog catalog = null; - - @BeforeClass - public static void startMetastoreAndSpark() { - SparkTestBase.metastore = new TestHiveMetastore(); - metastore.start(); - SparkTestBase.hiveConf = metastore.hiveConf(); - - SparkTestBase.spark = - SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); - - SparkTestBase.catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - - try { - catalog.createNamespace(Namespace.of("default")); - } catch (AlreadyExistsException ignored) { - // the default namespace already exists. ignore the create error - } - } - - @AfterClass - public static void stopMetastoreAndSpark() throws Exception { - SparkTestBase.catalog = null; - if (metastore != null) { - metastore.stop(); - SparkTestBase.metastore = null; - } - if (spark != null) { - spark.stop(); - SparkTestBase.spark = null; - } - } - - protected long waitUntilAfter(long timestampMillis) { - long current = System.currentTimeMillis(); - while (current <= timestampMillis) { - current = System.currentTimeMillis(); - } - return current; - } - - protected List sql(String query, Object... 
args) { - List rows = spark.sql(String.format(query, args)).collectAsList(); - if (rows.size() < 1) { - return ImmutableList.of(); - } - - return rowsToJava(rows); - } - - protected List rowsToJava(List rows) { - return rows.stream().map(this::toJava).collect(Collectors.toList()); - } - - private Object[] toJava(Row row) { - return IntStream.range(0, row.size()) - .mapToObj( - pos -> { - if (row.isNullAt(pos)) { - return null; - } - - Object value = row.get(pos); - if (value instanceof Row) { - return toJava((Row) value); - } else if (value instanceof scala.collection.Seq) { - return row.getList(pos); - } else if (value instanceof scala.collection.Map) { - return row.getJavaMap(pos); - } else { - return value; - } - }) - .toArray(Object[]::new); - } - - protected Object scalarSql(String query, Object... args) { - List rows = sql(query, args); - Assert.assertEquals("Scalar SQL should return one row", 1, rows.size()); - Object[] row = Iterables.getOnlyElement(rows); - Assert.assertEquals("Scalar SQL should return one value", 1, row.length); - return row[0]; - } - - protected Object[] row(Object... values) { - return values; - } - - protected void assertEquals( - String context, List expectedRows, List actualRows) { - Assert.assertEquals( - context + ": number of results should match", expectedRows.size(), actualRows.size()); - for (int row = 0; row < expectedRows.size(); row += 1) { - Object[] expected = expectedRows.get(row); - Object[] actual = actualRows.get(row); - Assert.assertEquals("Number of columns should match", expected.length, actual.length); - for (int col = 0; col < actualRows.get(row).length; col += 1) { - String newContext = String.format("%s: row %d col %d", context, row + 1, col + 1); - assertEquals(newContext, expected, actual); - } - } - } - - private void assertEquals(String context, Object[] expectedRow, Object[] actualRow) { - Assert.assertEquals("Number of columns should match", expectedRow.length, actualRow.length); - for (int col = 0; col < actualRow.length; col += 1) { - Object expectedValue = expectedRow[col]; - Object actualValue = actualRow[col]; - if (expectedValue != null && expectedValue.getClass().isArray()) { - String newContext = String.format("%s (nested col %d)", context, col + 1); - assertEquals(newContext, (Object[]) expectedValue, (Object[]) actualValue); - } else if (expectedValue != ANY) { - Assert.assertEquals(context + " contents should match", expectedValue, actualValue); - } - } - } - - protected static String dbPath(String dbName) { - return metastore.getDatabasePath(dbName); - } - - protected void withSQLConf(Map conf, Action action) { - SQLConf sqlConf = SQLConf.get(); - - Map currentConfValues = Maps.newHashMap(); - conf.keySet() - .forEach( - confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach( - (confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); - - try { - action.invoke(); - } finally { - conf.forEach( - (confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); - } - } - - @FunctionalInterface - protected interface Action { - void invoke(); - } -} diff --git 
a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java deleted file mode 100644 index 4e6331982d85..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.IOException; -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; - -public class TestSparkSchemaUtil { - private static final Schema TEST_SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - - @Test - public void testEstiamteSizeMaxValue() throws IOException { - Assert.assertEquals( - "estimateSize returns Long max value", - Long.MAX_VALUE, - SparkSchemaUtil.estimateSize(null, Long.MAX_VALUE)); - } - - @Test - public void testEstiamteSizeWithOverflow() throws IOException { - long tableSize = - SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); - Assert.assertEquals("estimateSize handles overflow", Long.MAX_VALUE, tableSize); - } - - @Test - public void testEstiamteSize() throws IOException { - long tableSize = SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), 1); - Assert.assertEquals("estimateSize matches with expected approximation", 24, tableSize); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java deleted file mode 100644 index 7f00c7edd8a9..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.junit.Assert; -import org.junit.Test; - -public class TestSparkValueConverter { - @Test - public void testSparkNullMapConvert() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()))))); - - assertCorrectNullConversion(schema); - } - - @Test - public void testSparkNullListConvert() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, "locations", Types.ListType.ofOptional(6, Types.StringType.get()))); - - assertCorrectNullConversion(schema); - } - - @Test - public void testSparkNullStructConvert() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - assertCorrectNullConversion(schema); - } - - @Test - public void testSparkNullPrimitiveConvert() { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StringType.get())); - assertCorrectNullConversion(schema); - } - - private void assertCorrectNullConversion(Schema schema) { - Row sparkRow = RowFactory.create(1, null); - Record record = GenericRecord.create(schema); - record.set(0, 1); - Assert.assertEquals( - "Round-trip conversion should produce original value", - record, - SparkValueConverter.convert(schema, sparkRow)); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java deleted file mode 100644 index 7124c51ddd3d..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.StreamSupport; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.HasTableOperations; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.actions.ActionsProvider; -import org.apache.iceberg.actions.DeleteOrphanFiles; -import org.apache.iceberg.actions.DeleteReachableFiles; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestDeleteReachableFilesAction extends SparkTestBase { - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get())); - private static final int SHUFFLE_PARTITIONS = 2; - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - - static final DataFile FILE_A = - DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(0)) - .withRecordCount(1) - .build(); - static final DataFile FILE_B = - DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(1)) - .withRecordCount(1) - .build(); - static final DataFile FILE_C = - DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(2)) - .withRecordCount(1) - .build(); - static final DataFile FILE_D = - DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(3)) - .withRecordCount(1) - .build(); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private Table table; - - @Before - public void setupTableLocation() throws Exception { - File tableDir = temp.newFolder(); - String tableLocation = tableDir.toURI().toString(); - this.table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - spark.conf().set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS); - } - - private void checkRemoveFilesResults( - long expectedDatafiles, - long expectedManifestsDeleted, - long expectedManifestListsDeleted, - long expectedOtherFilesDeleted, - DeleteReachableFiles.Result results) { - Assert.assertEquals( - "Incorrect 
number of manifest files deleted", - expectedManifestsDeleted, - results.deletedManifestsCount()); - Assert.assertEquals( - "Incorrect number of datafiles deleted", - expectedDatafiles, - results.deletedDataFilesCount()); - Assert.assertEquals( - "Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, - results.deletedManifestListsCount()); - Assert.assertEquals( - "Incorrect number of other lists deleted", - expectedOtherFilesDeleted, - results.deletedOtherFilesCount()); - } - - @Test - public void dataFilesCleanupWithParallelTasks() { - table.newFastAppend().appendFile(FILE_A).commit(); - - table.newFastAppend().appendFile(FILE_B).commit(); - - table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - - table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); - - Set deletedFiles = ConcurrentHashMap.newKeySet(); - Set deleteThreads = ConcurrentHashMap.newKeySet(); - AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - - DeleteReachableFiles.Result result = - sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .executeDeleteWith( - Executors.newFixedThreadPool( - 4, - runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon( - true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .deleteWith( - s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService - // ThreadFactory - Assert.assertEquals( - deleteThreads, - Sets.newHashSet("remove-files-0", "remove-files-1", "remove-files-2", "remove-files-3")); - - Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D) - .forEach( - file -> - Assert.assertTrue( - "FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()))); - checkRemoveFilesResults(4L, 6L, 4L, 6, result); - } - - @Test - public void testWithExpiringDanglingStageCommit() { - table.location(); - // `A` commit - table.newAppend().appendFile(FILE_A).commit(); - - // `B` staged commit - table.newAppend().appendFile(FILE_B).stageOnly().commit(); - - // `C` commit - table.newAppend().appendFile(FILE_C).commit(); - - DeleteReachableFiles.Result result = - sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); - - checkRemoveFilesResults(3L, 3L, 3L, 5, result); - } - - @Test - public void testRemoveFileActionOnEmptyTable() { - DeleteReachableFiles.Result result = - sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); - - checkRemoveFilesResults(0, 0, 0, 2, result); - } - - @Test - public void testRemoveFilesActionWithReducedVersionsTable() { - table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); - table.newAppend().appendFile(FILE_A).commit(); - - table.newAppend().appendFile(FILE_B).commit(); - - table.newAppend().appendFile(FILE_B).commit(); - - table.newAppend().appendFile(FILE_C).commit(); - - table.newAppend().appendFile(FILE_D).commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = - sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); - DeleteReachableFiles.Result result = baseRemoveFilesSparkAction.execute(); - - checkRemoveFilesResults(4, 5, 5, 8, result); - } - - @Test - public void testRemoveFilesAction() { - 
table.newAppend().appendFile(FILE_A).commit(); - - table.newAppend().appendFile(FILE_B).commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = - sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); - } - - @Test - public void testRemoveFilesActionWithDefaultIO() { - table.newAppend().appendFile(FILE_A).commit(); - - table.newAppend().appendFile(FILE_B).commit(); - - // IO not set explicitly on removeReachableFiles action - // IO defaults to HadoopFileIO - DeleteReachableFiles baseRemoveFilesSparkAction = - sparkActions().deleteReachableFiles(metadataLocation(table)); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); - } - - @Test - public void testUseLocalIterator() { - table.newFastAppend().appendFile(FILE_A).commit(); - - table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - - table.newFastAppend().appendFile(FILE_C).commit(); - - int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); - - DeleteReachableFiles.Result results = - sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .option("stream-results", "true") - .execute(); - - int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); - int totalJobsRun = jobsAfter - jobsBefore; - - checkRemoveFilesResults(3L, 4L, 3L, 5, results); - - Assert.assertEquals( - "Expected total jobs to be equal to total number of shuffle partitions", - totalJobsRun, - SHUFFLE_PARTITIONS); - } - - @Test - public void testIgnoreMetadataFilesNotFound() { - table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - - table.newAppend().appendFile(FILE_A).commit(); - // There are three metadata json files at this point - DeleteOrphanFiles.Result result = - sparkActions().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue( - "Should remove v1 file", - StreamSupport.stream(result.orphanFileLocations().spliterator(), false) - .anyMatch(file -> file.contains("v1.metadata.json"))); - - DeleteReachableFiles baseRemoveFilesSparkAction = - sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); - DeleteReachableFiles.Result res = baseRemoveFilesSparkAction.execute(); - - checkRemoveFilesResults(1, 1, 1, 4, res); - } - - @Test - public void testEmptyIOThrowsException() { - DeleteReachableFiles baseRemoveFilesSparkAction = - sparkActions().deleteReachableFiles(metadataLocation(table)).io(null); - AssertHelpers.assertThrows( - "FileIO needs to be set to use RemoveFiles action", - IllegalArgumentException.class, - "File IO cannot be null", - baseRemoveFilesSparkAction::execute); - } - - @Test - public void testRemoveFilesActionWhenGarbageCollectionDisabled() { - table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - - AssertHelpers.assertThrows( - "Should complain about removing files when GC is disabled", - ValidationException.class, - "Cannot remove files: GC is disabled (deleting files may corrupt other tables)", - () -> sparkActions().deleteReachableFiles(metadataLocation(table))); - } - - private String metadataLocation(Table tbl) { - return ((HasTableOperations) tbl).operations().current().metadataFileLocation(); - } - - private ActionsProvider sparkActions() { - return SparkActions.get(); - } -} diff --git 
a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java deleted file mode 100644 index 7a50ee2f5d22..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java +++ /dev/null @@ -1,1121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.actions.ExpireSnapshots; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestExpireSnapshotsAction extends SparkTestBase { - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get())); - private static final int SHUFFLE_PARTITIONS = 2; - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - - static final DataFile FILE_A = - DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - 
.withRecordCount(1) - .build(); - static final DataFile FILE_B = - DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_C = - DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_D = - DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private File tableDir; - private String tableLocation; - private Table table; - - @Before - public void setupTableLocation() throws Exception { - this.tableDir = temp.newFolder(); - this.tableLocation = tableDir.toURI().toString(); - this.table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - spark.conf().set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS); - } - - private Long rightAfterSnapshot() { - return rightAfterSnapshot(table.currentSnapshot().snapshotId()); - } - - private Long rightAfterSnapshot(long snapshotId) { - Long end = System.currentTimeMillis(); - while (end <= table.snapshot(snapshotId).timestampMillis()) { - end = System.currentTimeMillis(); - } - return end; - } - - private void checkExpirationResults( - long expectedDatafiles, - long expectedManifestsDeleted, - long expectedManifestListsDeleted, - ExpireSnapshots.Result results) { - - Assert.assertEquals( - "Incorrect number of manifest files deleted", - expectedManifestsDeleted, - results.deletedManifestsCount()); - Assert.assertEquals( - "Incorrect number of datafiles deleted", - expectedDatafiles, - results.deletedDataFilesCount()); - Assert.assertEquals( - "Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, - results.deletedManifestListsCount()); - } - - @Test - public void testFilesCleaned() throws Exception { - table.newFastAppend().appendFile(FILE_A).commit(); - - table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - - table.newFastAppend().appendFile(FILE_C).commit(); - - long end = rightAfterSnapshot(); - - ExpireSnapshots.Result results = - SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); - - Assert.assertEquals( - "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); - - checkExpirationResults(1L, 1L, 2L, results); - } - - @Test - public void dataFilesCleanupWithParallelTasks() throws IOException { - - table.newFastAppend().appendFile(FILE_A).commit(); - - table.newFastAppend().appendFile(FILE_B).commit(); - - table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - - table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); - - long t4 = rightAfterSnapshot(); - - Set deletedFiles = Sets.newHashSet(); - Set deleteThreads = ConcurrentHashMap.newKeySet(); - AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .executeDeleteWith( - Executors.newFixedThreadPool( - 4, - runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon( - true); // daemon threads will be 
terminated abruptly when the JVM exits - return thread; - })) - .expireOlderThan(t4) - .deleteWith( - s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService - // ThreadFactory - Assert.assertEquals( - deleteThreads, - Sets.newHashSet( - "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); - - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); - Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); - - checkExpirationResults(2L, 3L, 3L, result); - } - - @Test - public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { - table.newFastAppend().appendFile(FILE_A).commit(); - - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).execute(); - checkExpirationResults(0L, 0L, 0L, results); - } - - @Test - public void testCleanupRepeatedOverwrites() throws Exception { - table.newFastAppend().appendFile(FILE_A).commit(); - - for (int i = 0; i < 10; i++) { - table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - - table.newOverwrite().deleteFile(FILE_B).addFile(FILE_A).commit(); - } - - long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = - SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); - checkExpirationResults(1L, 39L, 20L, results); - } - - @Test - public void testRetainLastWithExpireOlderThan() { - table - .newAppend() - .appendFile(FILE_A) // data_bucket=0 - .commit(); - long firstSnapshotId = table.currentSnapshot().snapshotId(); - long t1 = System.currentTimeMillis(); - while (t1 <= table.currentSnapshot().timestampMillis()) { - t1 = System.currentTimeMillis(); - } - - table - .newAppend() - .appendFile(FILE_B) // data_bucket=1 - .commit(); - - table - .newAppend() - .appendFile(FILE_C) // data_bucket=2 - .commit(); - - long t3 = rightAfterSnapshot(); - - // Retain last 2 snapshots - SparkActions.get().expireSnapshots(table).expireOlderThan(t3).retainLast(2).execute(); - - Assert.assertEquals( - "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - } - - @Test - public void testExpireTwoSnapshotsById() throws Exception { - table - .newAppend() - .appendFile(FILE_A) // data_bucket=0 - .commit(); - long firstSnapshotId = table.currentSnapshot().snapshotId(); - - table - .newAppend() - .appendFile(FILE_B) // data_bucket=1 - .commit(); - - long secondSnapshotID = table.currentSnapshot().snapshotId(); - - table - .newAppend() - .appendFile(FILE_C) // data_bucket=2 - .commit(); - - // Retain last 2 snapshots - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .expireSnapshotId(secondSnapshotID) - .execute(); - - Assert.assertEquals( - "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - Assert.assertEquals( - "Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); - - checkExpirationResults(0L, 0L, 2L, result); - } - - @Test - public void testRetainLastWithExpireById() { - table - .newAppend() - .appendFile(FILE_A) // data_bucket=0 - .commit(); - long firstSnapshotId = 
table.currentSnapshot().snapshotId(); - - table - .newAppend() - .appendFile(FILE_B) // data_bucket=1 - .commit(); - - table - .newAppend() - .appendFile(FILE_C) // data_bucket=2 - .commit(); - - // Retain last 3 snapshots, but explicitly remove the first snapshot - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .retainLast(3) - .execute(); - - Assert.assertEquals( - "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - checkExpirationResults(0L, 0L, 1L, result); - } - - @Test - public void testRetainLastWithTooFewSnapshots() { - table - .newAppend() - .appendFile(FILE_A) // data_bucket=0 - .appendFile(FILE_B) // data_bucket=1 - .commit(); - long firstSnapshotId = table.currentSnapshot().snapshotId(); - - table - .newAppend() - .appendFile(FILE_C) // data_bucket=2 - .commit(); - - long t2 = rightAfterSnapshot(); - - // Retain last 3 snapshots - ExpireSnapshots.Result result = - SparkActions.get().expireSnapshots(table).expireOlderThan(t2).retainLast(3).execute(); - - Assert.assertEquals( - "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals( - "First snapshot should still present", - firstSnapshotId, - table.snapshot(firstSnapshotId).snapshotId()); - checkExpirationResults(0L, 0L, 0L, result); - } - - @Test - public void testRetainLastKeepsExpiringSnapshot() { - table - .newAppend() - .appendFile(FILE_A) // data_bucket=0 - .commit(); - - table - .newAppend() - .appendFile(FILE_B) // data_bucket=1 - .commit(); - - Snapshot secondSnapshot = table.currentSnapshot(); - - table - .newAppend() - .appendFile(FILE_C) // data_bucket=2 - .commit(); - - table - .newAppend() - .appendFile(FILE_D) // data_bucket=3 - .commit(); - - // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .retainLast(2) - .execute(); - - Assert.assertEquals( - "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull( - "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); - checkExpirationResults(0L, 0L, 1L, result); - } - - @Test - public void testExpireSnapshotsWithDisabledGarbageCollection() { - table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - - table.newAppend().appendFile(FILE_A).commit(); - - AssertHelpers.assertThrows( - "Should complain about expiring snapshots", - ValidationException.class, - "Cannot expire snapshots: GC is disabled", - () -> SparkActions.get().expireSnapshots(table)); - } - - @Test - public void testExpireOlderThanMultipleCalls() { - table - .newAppend() - .appendFile(FILE_A) // data_bucket=0 - .commit(); - - table - .newAppend() - .appendFile(FILE_B) // data_bucket=1 - .commit(); - - Snapshot secondSnapshot = table.currentSnapshot(); - - table - .newAppend() - .appendFile(FILE_C) // data_bucket=2 - .commit(); - - Snapshot thirdSnapshot = table.currentSnapshot(); - - // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .expireOlderThan(thirdSnapshot.timestampMillis()) - .execute(); - - Assert.assertEquals( - "Should have one snapshots.", 1, 
Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull( - "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); - checkExpirationResults(0L, 0L, 2L, result); - } - - @Test - public void testRetainLastMultipleCalls() { - table - .newAppend() - .appendFile(FILE_A) // data_bucket=0 - .commit(); - - table - .newAppend() - .appendFile(FILE_B) // data_bucket=1 - .commit(); - - Snapshot secondSnapshot = table.currentSnapshot(); - - table - .newAppend() - .appendFile(FILE_C) // data_bucket=2 - .commit(); - - long t3 = rightAfterSnapshot(); - - // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .retainLast(1) - .execute(); - - Assert.assertEquals( - "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull( - "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); - checkExpirationResults(0L, 0L, 2L, result); - } - - @Test - public void testRetainZeroSnapshots() { - AssertHelpers.assertThrows( - "Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", - IllegalArgumentException.class, - "Number of snapshots to retain must be at least 1, cannot be: 0", - () -> SparkActions.get().expireSnapshots(table).retainLast(0).execute()); - } - - @Test - public void testScanExpiredManifestInValidSnapshotAppend() { - table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - - table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - - table.newAppend().appendFile(FILE_D).commit(); - - long t3 = rightAfterSnapshot(); - - Set deletedFiles = Sets.newHashSet(); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); - checkExpirationResults(1L, 1L, 2L, result); - } - - @Test - public void testScanExpiredManifestInValidSnapshotFastAppend() { - table - .updateProperties() - .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") - .commit(); - - table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - - table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - - table.newFastAppend().appendFile(FILE_D).commit(); - - long t3 = rightAfterSnapshot(); - - Set deletedFiles = Sets.newHashSet(); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); - checkExpirationResults(1L, 1L, 2L, result); - } - - /** - * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. Table: A - - * C ` B (staged) - */ - @Test - public void testWithExpiringDanglingStageCommit() { - // `A` commit - table.newAppend().appendFile(FILE_A).commit(); - - // `B` staged commit - table.newAppend().appendFile(FILE_B).stageOnly().commit(); - - TableMetadata base = ((BaseTable) table).operations().current(); - Snapshot snapshotA = base.snapshots().get(0); - Snapshot snapshotB = base.snapshots().get(1); - - // `C` commit - table.newAppend().appendFile(FILE_C).commit(); - - Set deletedFiles = Sets.newHashSet(); - - // Expire all commits including dangling staged snapshot. 
- ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotB.timestampMillis() + 1) - .execute(); - - checkExpirationResults(1L, 1L, 2L, result); - - Set expectedDeletes = Sets.newHashSet(); - expectedDeletes.add(snapshotA.manifestListLocation()); - - // Files should be deleted of dangling staged snapshot - snapshotB - .addedDataFiles(table.io()) - .forEach( - i -> { - expectedDeletes.add(i.path().toString()); - }); - - // ManifestList should be deleted too - expectedDeletes.add(snapshotB.manifestListLocation()); - snapshotB - .dataManifests(table.io()) - .forEach( - file -> { - // Only the manifest of B should be deleted. - if (file.snapshotId() == snapshotB.snapshotId()) { - expectedDeletes.add(file.path()); - } - }); - Assert.assertSame( - "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); - // Take the diff - expectedDeletes.removeAll(deletedFiles); - Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); - } - - /** - * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B - * - C <--current snapshot `- D (source=B) - */ - @Test - public void testWithCherryPickTableSnapshot() { - // `A` commit - table.newAppend().appendFile(FILE_A).commit(); - Snapshot snapshotA = table.currentSnapshot(); - - // `B` commit - Set deletedAFiles = Sets.newHashSet(); - table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); - Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); - - // pick the snapshot 'B` - Snapshot snapshotB = table.currentSnapshot(); - - // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend().appendFile(FILE_C).commit(); - Snapshot snapshotC = table.currentSnapshot(); - - // Move the table back to `A` - table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit(); - - // Generate A -> `D (B)` - table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); - Snapshot snapshotD = table.currentSnapshot(); - - // Move the table back to `C` - table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit(); - List deletedFiles = Lists.newArrayList(); - - // Expire `C` - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotC.timestampMillis() + 1) - .execute(); - - // Make sure no dataFiles are deleted for the B, C, D snapshot - Lists.newArrayList(snapshotB, snapshotC, snapshotD) - .forEach( - i -> { - i.addedDataFiles(table.io()) - .forEach( - item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); - - checkExpirationResults(1L, 2L, 2L, result); - } - - /** - * Test on table below, and expiring `B` which is not in current table state. 
1) Expire `B` 2) All - * commit Table: A - C - D (B) ` B (staged) - */ - @Test - public void testWithExpiringStagedThenCherrypick() { - // `A` commit - table.newAppend().appendFile(FILE_A).commit(); - - // `B` commit - table.newAppend().appendFile(FILE_B).stageOnly().commit(); - - // pick the snapshot that's staged but not committed - TableMetadata base = ((BaseTable) table).operations().current(); - Snapshot snapshotB = base.snapshots().get(1); - - // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend().appendFile(FILE_C).commit(); - - // `D (B)` cherry-pick commit - table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); - - base = ((BaseTable) table).operations().current(); - Snapshot snapshotD = base.snapshots().get(3); - - List deletedFiles = Lists.newArrayList(); - - // Expire `B` commit. - ExpireSnapshots.Result firstResult = - SparkActions.get() - .expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireSnapshotId(snapshotB.snapshotId()) - .execute(); - - // Make sure no dataFiles are deleted for the staged snapshot - Lists.newArrayList(snapshotB) - .forEach( - i -> { - i.addedDataFiles(table.io()) - .forEach( - item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); - checkExpirationResults(0L, 1L, 1L, firstResult); - - // Expire all snapshots including cherry-pick - ExpireSnapshots.Result secondResult = - SparkActions.get() - .expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) - .execute(); - - // Make sure no dataFiles are deleted for the staged and cherry-pick - Lists.newArrayList(snapshotB, snapshotD) - .forEach( - i -> { - i.addedDataFiles(table.io()) - .forEach( - item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); - checkExpirationResults(0L, 0L, 2L, secondResult); - } - - @Test - public void testExpireOlderThan() { - table.newAppend().appendFile(FILE_A).commit(); - - Snapshot firstSnapshot = table.currentSnapshot(); - - rightAfterSnapshot(); - - table.newAppend().appendFile(FILE_B).commit(); - - long snapshotId = table.currentSnapshot().snapshotId(); - - long tAfterCommits = rightAfterSnapshot(); - - Set deletedFiles = Sets.newHashSet(); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNull( - "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals( - "Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), - deletedFiles); - - checkExpirationResults(0, 0, 1, result); - } - - @Test - public void testExpireOlderThanWithDelete() { - table.newAppend().appendFile(FILE_A).commit(); - - Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); - - rightAfterSnapshot(); - - table.newDelete().deleteFile(FILE_A).commit(); - - Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create replace manifest with a rewritten manifest", - 1, - secondSnapshot.allManifests(table.io()).size()); - - table.newAppend().appendFile(FILE_B).commit(); - - rightAfterSnapshot(); - - long 
snapshotId = table.currentSnapshot().snapshotId(); - - long tAfterCommits = rightAfterSnapshot(); - - Set deletedFiles = Sets.newHashSet(); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNull( - "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the second oldest snapshot", - table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and deleted data file", - Sets.newHashSet( - firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot - .allManifests(table.io()) - .get(0) - .path(), // manifest was rewritten for delete - secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot - .allManifests(table.io()) - .get(0) - .path(), // manifest contained only deletes, was dropped - FILE_A.path()), // deleted - deletedFiles); - - checkExpirationResults(1, 2, 2, result); - } - - @Test - public void testExpireOlderThanWithDeleteInMergedManifests() { - // merge every commit - table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - - table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - - Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); - - rightAfterSnapshot(); - - table - .newDelete() - .deleteFile(FILE_A) // FILE_B is still in the dataset - .commit(); - - Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should replace manifest with a rewritten manifest", - 1, - secondSnapshot.allManifests(table.io()).size()); - - table - .newFastAppend() // do not merge to keep the last snapshot's manifest valid - .appendFile(FILE_C) - .commit(); - - rightAfterSnapshot(); - - long snapshotId = table.currentSnapshot().snapshotId(); - - long tAfterCommits = rightAfterSnapshot(); - - Set deletedFiles = Sets.newHashSet(); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNull( - "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the second oldest snapshot", - table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and deleted data file", - Sets.newHashSet( - firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot - .allManifests(table.io()) - .get(0) - .path(), // manifest was rewritten for delete - secondSnapshot.manifestListLocation(), // snapshot expired - FILE_A.path()), // deleted - deletedFiles); - - checkExpirationResults(1, 1, 2, result); - } - - @Test - public void testExpireOlderThanWithRollback() { - // merge every commit - table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - - table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - - Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, 
firstSnapshot.allManifests(table.io()).size()); - - rightAfterSnapshot(); - - table.newDelete().deleteFile(FILE_B).commit(); - - Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = - Sets.newHashSet(secondSnapshot.allManifests(table.io())); - secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals( - "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - - table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); - - long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); - - long snapshotId = table.currentSnapshot().snapshotId(); - - Set deletedFiles = Sets.newHashSet(); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNotNull( - "Expire should keep the oldest snapshot, current", - table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and reverted appended data file", - Sets.newHashSet( - secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests) - .path()), // manifest is no longer referenced - deletedFiles); - - checkExpirationResults(0, 1, 1, result); - } - - @Test - public void testExpireOlderThanWithRollbackAndMergedManifests() { - table.newAppend().appendFile(FILE_A).commit(); - - Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals( - "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); - - rightAfterSnapshot(); - - table.newAppend().appendFile(FILE_B).commit(); - - Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = - Sets.newHashSet(secondSnapshot.allManifests(table.io())); - secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals( - "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - - table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); - - long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); - - long snapshotId = table.currentSnapshot().snapshotId(); - - Set deletedFiles = Sets.newHashSet(); - - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals( - "Expire should not change current snapshot", - snapshotId, - table.currentSnapshot().snapshotId()); - Assert.assertNotNull( - "Expire should keep the oldest snapshot, current", - table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull( - "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals( - "Should remove expired manifest lists and reverted appended data file", - Sets.newHashSet( - secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests) - .path(), // manifest is no longer referenced - FILE_B.path()), // added, but rolled back - deletedFiles); - - checkExpirationResults(1, 1, 1, result); - } - - @Test - public void testExpireOnEmptyTable() { - Set deletedFiles = Sets.newHashSet(); - - // table has no 
data, testing ExpireSnapshots should not fail with no snapshot - ExpireSnapshots.Result result = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(System.currentTimeMillis()) - .deleteWith(deletedFiles::add) - .execute(); - - checkExpirationResults(0, 0, 0, result); - } - - @Test - public void testExpireAction() { - table.newAppend().appendFile(FILE_A).commit(); - - Snapshot firstSnapshot = table.currentSnapshot(); - - rightAfterSnapshot(); - - table.newAppend().appendFile(FILE_B).commit(); - - long snapshotId = table.currentSnapshot().snapshotId(); - - long tAfterCommits = rightAfterSnapshot(); - - Set deletedFiles = Sets.newHashSet(); - - BaseExpireSnapshotsSparkAction action = - (BaseExpireSnapshotsSparkAction) - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add); - Dataset pendingDeletes = action.expire(); - - List pending = pendingDeletes.collectAsList(); - - Assert.assertEquals( - "Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull( - "Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - - Assert.assertEquals("Pending deletes should contain one row", 1, pending.size()); - Assert.assertEquals( - "Pending delete should be the expired manifest list location", - firstSnapshot.manifestListLocation(), - pending.get(0).getString(0)); - Assert.assertEquals( - "Pending delete should be a manifest list", "Manifest List", pending.get(0).getString(1)); - - Assert.assertEquals("Should not delete any files", 0, deletedFiles.size()); - - Assert.assertSame( - "Multiple calls to expire should return the same deleted files", - pendingDeletes, - action.expire()); - } - - @Test - public void testUseLocalIterator() { - table.newFastAppend().appendFile(FILE_A).commit(); - - table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - - table.newFastAppend().appendFile(FILE_C).commit(); - - long end = rightAfterSnapshot(); - - int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); - - ExpireSnapshots.Result results = - SparkActions.get() - .expireSnapshots(table) - .expireOlderThan(end) - .option("stream-results", "true") - .execute(); - - Assert.assertEquals( - "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); - - int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); - int totalJobsRun = jobsAfter - jobsBefore; - - checkExpirationResults(1L, 1L, 2L, results); - - Assert.assertTrue( - String.format( - "Expected more than %d jobs when using local iterator, ran %d", - SHUFFLE_PARTITIONS, totalJobsRun), - totalJobsRun > SHUFFLE_PARTITIONS); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java deleted file mode 100644 index 9369ca66171e..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java +++ /dev/null @@ -1,737 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import java.net.URI; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.Files; -import org.apache.iceberg.GenericBlobMetadata; -import org.apache.iceberg.GenericStatisticsFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.StatisticsFile; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.actions.DeleteOrphanFiles; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.hadoop.HiddenPathFilter; -import org.apache.iceberg.puffin.Blob; -import org.apache.iceberg.puffin.Puffin; -import org.apache.iceberg.puffin.PuffinWriter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.spark.source.ThreeColumnRecord; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestRemoveOrphanFilesAction extends SparkTestBase { - - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - protected static final Schema SCHEMA = - new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get())); - protected static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).truncate("c2", 2).identity("c3").build(); - - @Rule public TemporaryFolder temp = new 
TemporaryFolder(); - private File tableDir = null; - protected String tableLocation = null; - - @Before - public void setupTableLocation() throws Exception { - this.tableDir = temp.newFolder(); - this.tableLocation = tableDir.toURI().toString(); - } - - @Test - public void testDryRun() throws IOException, InterruptedException { - Table table = - TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - List validFiles = - spark - .read() - .format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); - Assert.assertEquals("Should be 2 valid files", 2, validFiles.size()); - - df.write().mode("append").parquet(tableLocation + "/data"); - - Path dataPath = new Path(tableLocation + "/data"); - FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = - Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); - Assert.assertEquals("Should be 3 files", 3, allFiles.size()); - - List invalidFiles = Lists.newArrayList(allFiles); - invalidFiles.removeAll(validFiles); - Assert.assertEquals("Should be 1 invalid file", 1, invalidFiles.size()); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result1 = - actions.deleteOrphanFiles(table).deleteWith(s -> {}).execute(); - Assert.assertTrue( - "Default olderThan interval should be safe", - Iterables.isEmpty(result1.orphanFileLocations())); - - DeleteOrphanFiles.Result result2 = - actions - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> {}) - .execute(); - Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations()); - Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); - - DeleteOrphanFiles.Result result3 = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations()); - Assert.assertFalse( - "Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records); - expectedRecords.addAll(records); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testAllValidFilesAreKept() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - - List records1 = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); - - // original append - df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - List records2 = - 
Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); - Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); - - // dynamic partition overwrite - df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); - - // second append - df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - List snapshots = Lists.newArrayList(table.snapshots()); - - List snapshotFiles1 = snapshotFiles(snapshots.get(0).snapshotId()); - Assert.assertEquals(1, snapshotFiles1.size()); - - List snapshotFiles2 = snapshotFiles(snapshots.get(1).snapshotId()); - Assert.assertEquals(1, snapshotFiles2.size()); - - List snapshotFiles3 = snapshotFiles(snapshots.get(2).snapshotId()); - Assert.assertEquals(2, snapshotFiles3.size()); - - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data"); - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA"); - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/invalid/invalid"); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertEquals("Should delete 4 files", 4, Iterables.size(result.orphanFileLocations())); - - Path dataPath = new Path(tableLocation + "/data"); - FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - - for (String fileLocation : snapshotFiles1) { - Assert.assertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation))); - } - - for (String fileLocation : snapshotFiles2) { - Assert.assertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation))); - } - - for (String fileLocation : snapshotFiles3) { - Assert.assertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation))); - } - } - - @Test - public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException { - Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - - List records1 = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); - - // original append - df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - List records2 = - Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); - Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); - - // dynamic partition overwrite - df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); - - // second append - df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data"); - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA"); - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); - df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/invalid/invalid"); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - Set deletedFiles = Sets.newHashSet(); - Set deleteThreads = ConcurrentHashMap.newKeySet(); - AtomicInteger deleteThreadsIndex = 
new AtomicInteger(0); - - ExecutorService executorService = - Executors.newFixedThreadPool( - 4, - runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); - return thread; - }); - - DeleteOrphanFiles.Result result = - SparkActions.get() - .deleteOrphanFiles(table) - .executeDeleteWith(executorService) - .olderThan(System.currentTimeMillis()) - .deleteWith( - file -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(file); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService - // ThreadFactory - Assert.assertEquals( - deleteThreads, - Sets.newHashSet( - "remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); - - Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size()); - } - - @Test - public void testWapFilesAreKept() throws InterruptedException { - Map props = Maps.newHashMap(); - props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true"); - Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - - // normal write - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - spark.conf().set("spark.wap.id", "1"); - - // wap write - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertTrue( - "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); - } - - @Test - public void testMetadataFolderIsIntact() throws InterruptedException { - // write data directly to the table location - Map props = Maps.newHashMap(); - props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); - Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA"); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Rows must match", records, actualRecords); - } - - @Test - public void testOlderThanTimestamp() throws InterruptedException { - Table table = TABLES.create(SCHEMA, SPEC, 
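The removed test above exercises the parallel-delete path of the orphan file action: a caller-supplied ExecutorService runs the deletes and a custom callback observes each removed location. A minimal sketch of that usage, assuming a Table loaded elsewhere (the class and method names here are illustrative, not part of the deleted code):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;

class OrphanFileCleanupSketch {
  static DeleteOrphanFiles.Result cleanUp(Table table) {
    ExecutorService pool = Executors.newFixedThreadPool(4);
    try {
      return SparkActions.get()
          .deleteOrphanFiles(table)
          .olderThan(System.currentTimeMillis())                    // only consider files older than this timestamp
          .executeDeleteWith(pool)                                  // run deletes on the supplied pool
          .deleteWith(location -> table.io().deleteFile(location))  // custom delete callback, as in the test
          .execute();
    } finally {
      pool.shutdown();
    }
  }
}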
Maps.newHashMap(), tableLocation); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); - df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); - - Thread.sleep(1000); - - long timestamp = System.currentTimeMillis(); - - Thread.sleep(1000); - - df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result = - actions.deleteOrphanFiles(table).olderThan(timestamp).execute(); - - Assert.assertEquals( - "Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); - } - - @Test - public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedException { - Map props = Maps.newHashMap(); - props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); - props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1"); - Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue( - "Should remove v1 file", - StreamSupport.stream(result.orphanFileLocations().spliterator(), false) - .anyMatch(file -> file.contains("v1.metadata.json"))); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records); - expectedRecords.addAll(records); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testManyTopLevelPartitions() throws InterruptedException { - Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - - List records = Lists.newArrayList(); - for (int i = 0; i < 100; i++) { - records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i))); - } - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertTrue( - "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Rows must match", records, 
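testOlderThanTimestamp above pins the cutoff explicitly instead of using "now": files written after the captured timestamp survive the cleanup. Sketched in isolation, under the same assumptions as before:

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;

class OlderThanSketch {
  static DeleteOrphanFiles.Result removeBefore(Table table, long cutoffMillis) {
    // Only unreferenced files last modified before cutoffMillis are deleted; newer ones are kept.
    return SparkActions.get()
        .deleteOrphanFiles(table)
        .olderThan(cutoffMillis)
        .execute();
  }
}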
actualRecords); - } - - @Test - public void testManyLeafPartitions() throws InterruptedException { - Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - - List records = Lists.newArrayList(); - for (int i = 0; i < 100; i++) { - records.add(new ThreeColumnRecord(i, String.valueOf(i % 3), String.valueOf(i))); - } - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertTrue( - "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Rows must match", records, actualRecords); - } - - private List snapshotFiles(long snapshotId) { - return spark - .read() - .format("iceberg") - .option("snapshot-id", snapshotId) - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); - } - - @Test - public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException { - Table table = - TABLES.create( - SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableDir.getAbsolutePath()); - - List validFiles = - spark - .read() - .format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); - Assert.assertEquals("Should be 1 valid files", 1, validFiles.size()); - String validFile = validFiles.get(0); - - df.write().mode("append").parquet(tableLocation + "/data"); - - Path dataPath = new Path(tableLocation + "/data"); - FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = - Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); - Assert.assertEquals("Should be 2 files", 2, allFiles.size()); - - List invalidFiles = Lists.newArrayList(allFiles); - invalidFiles.removeIf(file -> file.contains(validFile)); - Assert.assertEquals("Should be 1 invalid file", 1, invalidFiles.size()); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = - actions - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> {}) - .execute(); - Assert.assertEquals("Action should find 1 file", invalidFiles, result.orphanFileLocations()); - Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); - } - - @Test - public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException { - HadoopCatalog catalog = new HadoopCatalog(new Configuration(), tableLocation); - String namespaceName = "testDb"; - String tableName = "testTb"; - - Namespace namespace = 
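The snapshotFiles helper above relies on the "#files" metadata table of the Spark 2.4 source to list the data files referenced by one snapshot. A standalone sketch of the same read (the SparkSession and table location are assumed to exist):

import java.util.List;

import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

class SnapshotFilesSketch {
  static List<String> filePaths(SparkSession spark, String tableLocation, long snapshotId) {
    // "#files" loads the files metadata table; "snapshot-id" pins the read to one snapshot.
    return spark
        .read()
        .format("iceberg")
        .option("snapshot-id", snapshotId)
        .load(tableLocation + "#files")
        .select("file_path")
        .as(Encoders.STRING())
        .collectAsList();
  }
}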
Namespace.of(namespaceName); - TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName); - Table table = - catalog.createTable( - tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location()); - - df.write().mode("append").parquet(table.location() + "/data"); - - // sleep for 1 second to unsure files will be old enough - Thread.sleep(1000); - - table.refresh(); - - DeleteOrphanFiles.Result result = - SparkActions.get().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - - Assert.assertEquals( - "Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); - - Dataset resultDF = spark.read().format("iceberg").load(table.location()); - List actualRecords = - resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Rows must match", records, actualRecords); - } - - @Test - public void testHiveCatalogTable() throws IOException { - Table table = - catalog.createTable( - TableIdentifier.of("default", "hivetestorphan"), - SCHEMA, - SPEC, - tableLocation, - Maps.newHashMap()); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save("default.hivetestorphan"); - - String location = table.location().replaceFirst("file:", ""); - new File(location + "/data/trashfile").createNewFile(); - - DeleteOrphanFiles.Result result = - SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000) - .execute(); - Assert.assertTrue( - "trash file should be removed", - StreamSupport.stream(result.orphanFileLocations().spliterator(), false) - .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); - } - - @Test - public void testGarbageCollectionDisabled() { - Table table = - TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - - AssertHelpers.assertThrows( - "Should complain about removing orphan files", - ValidationException.class, - "Cannot remove orphan files: GC is disabled", - () -> SparkActions.get().deleteOrphanFiles(table).execute()); - } - - @Test - public void testRemoveOrphanFilesWithStatisticFiles() throws Exception { - Table table = - TABLES.create( - SCHEMA, - PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), - tableLocation); - - List records = - Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - - table.refresh(); - long snapshotId = table.currentSnapshot().snapshotId(); - long snapshotSequenceNumber = table.currentSnapshot().sequenceNumber(); - - File statsLocation = 
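testGarbageCollectionDisabled above documents the guard rail: once gc.enabled is false, deleteOrphanFiles refuses to run and throws a ValidationException. A hedged sketch of toggling that property (the error handling is only there to illustrate the expected failure):

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.spark.actions.SparkActions;

class GcDisabledSketch {
  static void demo(Table table) {
    table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit();
    try {
      SparkActions.get().deleteOrphanFiles(table).execute();
    } catch (ValidationException e) {
      // Expected while gc.enabled=false: "Cannot remove orphan files: GC is disabled".
      table.updateProperties().set(TableProperties.GC_ENABLED, "true").commit();
    }
  }
}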
- new File(new URI(tableLocation)) - .toPath() - .resolve("data") - .resolve("some-stats-file") - .toFile(); - StatisticsFile statisticsFile; - try (PuffinWriter puffinWriter = Puffin.write(Files.localOutput(statsLocation)).build()) { - puffinWriter.add( - new Blob( - "some-blob-type", - ImmutableList.of(1), - snapshotId, - snapshotSequenceNumber, - ByteBuffer.wrap("blob content".getBytes(StandardCharsets.UTF_8)))); - puffinWriter.finish(); - statisticsFile = - new GenericStatisticsFile( - snapshotId, - statsLocation.toString(), - puffinWriter.fileSize(), - puffinWriter.footerSize(), - puffinWriter.writtenBlobsMetadata().stream() - .map(GenericBlobMetadata::from) - .collect(ImmutableList.toImmutableList())); - } - - Transaction transaction = table.newTransaction(); - transaction.updateStatistics().setStatistics(snapshotId, statisticsFile).commit(); - transaction.commitTransaction(); - - SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000) - .execute(); - - Assertions.assertThat(statsLocation.exists()).as("stats file should exist").isTrue(); - Assertions.assertThat(statsLocation.length()) - .as("stats file length") - .isEqualTo(statisticsFile.fileSizeInBytes()); - - transaction = table.newTransaction(); - transaction.updateStatistics().removeStatistics(statisticsFile.snapshotId()).commit(); - transaction.commitTransaction(); - - DeleteOrphanFiles.Result result = - SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000) - .execute(); - Iterable orphanFileLocations = result.orphanFileLocations(); - Assertions.assertThat(orphanFileLocations).as("Should be orphan files").hasSize(1); - Assertions.assertThat(Iterables.getOnlyElement(orphanFileLocations)) - .as("Deleted file") - .isEqualTo(statsLocation.toURI().toString()); - Assertions.assertThat(statsLocation.exists()).as("stats file should be deleted").isFalse(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java deleted file mode 100644 index 86ac01eb459e..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java +++ /dev/null @@ -1,604 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
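The statistics-file test above walks the whole Puffin flow: write a stats file, register it on the snapshot, and confirm that deleteOrphanFiles leaves it alone until the statistics are dropped. A condensed sketch of the write-and-register half, mirroring the removed test (the blob type and content are illustrative values taken from it):

import java.io.File;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import org.apache.iceberg.Files;
import org.apache.iceberg.GenericBlobMetadata;
import org.apache.iceberg.GenericStatisticsFile;
import org.apache.iceberg.StatisticsFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.Transaction;
import org.apache.iceberg.puffin.Blob;
import org.apache.iceberg.puffin.Puffin;
import org.apache.iceberg.puffin.PuffinWriter;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;

class StatsFileSketch {
  static void attachStats(Table table, File statsLocation) throws Exception {
    long snapshotId = table.currentSnapshot().snapshotId();
    long sequenceNumber = table.currentSnapshot().sequenceNumber();

    StatisticsFile statisticsFile;
    try (PuffinWriter writer = Puffin.write(Files.localOutput(statsLocation)).build()) {
      writer.add(
          new Blob(
              "some-blob-type",
              ImmutableList.of(1),
              snapshotId,
              sequenceNumber,
              ByteBuffer.wrap("blob content".getBytes(StandardCharsets.UTF_8))));
      writer.finish();
      statisticsFile =
          new GenericStatisticsFile(
              snapshotId,
              statsLocation.toString(),
              writer.fileSize(),
              writer.footerSize(),
              writer.writtenBlobsMetadata().stream()
                  .map(GenericBlobMetadata::from)
                  .collect(ImmutableList.toImmutableList()));
    }

    // Registering the file makes it a referenced table artifact, so orphan cleanup keeps it.
    Transaction txn = table.newTransaction();
    txn.updateStatistics().setStatistics(snapshotId, statisticsFile).commit();
    txn.commitTransaction();
  }
}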
- */ -package org.apache.iceberg.spark.actions; - -import static org.apache.iceberg.ValidationHelpers.dataSeqs; -import static org.apache.iceberg.ValidationHelpers.fileSeqs; -import static org.apache.iceberg.ValidationHelpers.files; -import static org.apache.iceberg.ValidationHelpers.snapshotIds; -import static org.apache.iceberg.ValidationHelpers.validateDataManifest; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.actions.RewriteManifests; -import org.apache.iceberg.exceptions.CommitStateUnknownException; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkTableUtil; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.spark.source.ThreeColumnRecord; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.TableIdentifier; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestRewriteManifestsAction extends SparkTestBase { - - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get())); - - @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}") - public static Object[] parameters() { - return new Object[] {"true", "false"}; - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private final String snapshotIdInheritanceEnabled; - private String tableLocation = null; - - public TestRewriteManifestsAction(String snapshotIdInheritanceEnabled) { - this.snapshotIdInheritanceEnabled = snapshotIdInheritanceEnabled; - } - - @Before - public void setupTableLocation() throws Exception { - File tableDir = temp.newFolder(); - this.tableLocation = tableDir.toURI().toString(); - } - - @Test - public void testRewriteManifestsEmptyTable() throws IOException { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - Assert.assertNull("Table must be empty", table.currentSnapshot()); - - SparkActions actions = SparkActions.get(); - - actions - .rewriteManifests(table) - 
.rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); - - Assert.assertNull("Table must stay empty", table.currentSnapshot()); - } - - @Test - public void testRewriteSmallManifestsNonPartitionedTable() { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); - writeRecords(records2); - - table.refresh(); - - List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size()); - - SparkActions actions = SparkActions.get(); - - RewriteManifests.Result result = - actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - - Assert.assertEquals( - "Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals( - "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); - - table.refresh(); - - List newManifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 1 manifests after rewrite", 1, newManifests.size()); - - Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount()); - Assert.assertFalse(newManifests.get(0).hasAddedFiles()); - Assert.assertFalse(newManifests.get(0).hasDeletedFiles()); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testRewriteManifestsWithCommitStateUnknownException() { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); - writeRecords(records2); - - table.refresh(); - - List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size()); - - SparkActions actions = SparkActions.get(); - - // create a spy which would throw a CommitStateUnknownException after successful commit. 
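testRewriteSmallManifestsNonPartitionedTable above folds two one-commit manifests into a single one. The core call, sketched on its own (a Table with committed snapshots is assumed):

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;

class ConsolidateManifestsSketch {
  static RewriteManifests.Result consolidate(Table table) {
    // Small manifests are grouped up to the table's manifest target size
    // (TableProperties.MANIFEST_TARGET_SIZE_BYTES).
    return SparkActions.get()
        .rewriteManifests(table)
        .rewriteIf(manifest -> true) // consider every manifest
        .execute();
  }
}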
- org.apache.iceberg.RewriteManifests newRewriteManifests = table.rewriteManifests(); - org.apache.iceberg.RewriteManifests spyNewRewriteManifests = spy(newRewriteManifests); - doAnswer( - invocation -> { - newRewriteManifests.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }) - .when(spyNewRewriteManifests) - .commit(); - - Table spyTable = spy(table); - when(spyTable.rewriteManifests()).thenReturn(spyNewRewriteManifests); - - AssertHelpers.assertThrowsCause( - "Should throw a Commit State Unknown Exception", - RuntimeException.class, - "Datacenter on Fire", - () -> actions.rewriteManifests(spyTable).rewriteIf(manifest -> true).execute()); - - table.refresh(); - - // table should reflect the changes, since the commit was successful - List newManifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 1 manifests after rewrite", 1, newManifests.size()); - - Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount()); - Assert.assertFalse(newManifests.get(0).hasAddedFiles()); - Assert.assertFalse(newManifests.get(0).hasDeletedFiles()); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testRewriteSmallManifestsPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); - writeRecords(records2); - - List records3 = - Lists.newArrayList( - new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), - new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")); - writeRecords(records3); - - List records4 = - Lists.newArrayList( - new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), - new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")); - writeRecords(records4); - - table.refresh(); - - List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 4 manifests before rewrite", 4, manifests.size()); - - SparkActions actions = SparkActions.get(); - - // we will expect to have 2 manifests with 4 entries in each after rewrite - long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests); - long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes); - - table - .updateProperties() - .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)) - .commit(); - - RewriteManifests.Result result = - actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - - Assert.assertEquals( - "Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals( - "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); - - table.refresh(); - - List newManifests = 
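The partitioned-table test above sizes the rewrite deliberately: it averages the per-entry manifest size, sets the target manifest size to roughly four entries plus 5% slack, and only then rewrites. The same sizing pattern, sketched as a reusable method (names are illustrative; it assumes the current snapshot has at least one manifest entry):

import java.util.List;

import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;

class ManifestTargetSizeSketch {
  static RewriteManifests.Result rewriteToRoughly(Table table, int entriesPerManifest) {
    List<ManifestFile> manifests = table.currentSnapshot().allManifests(table.io());

    // Average bytes per manifest entry across the current manifests.
    long totalSize = 0L;
    int numEntries = 0;
    for (ManifestFile manifest : manifests) {
      totalSize += manifest.length();
      numEntries +=
          manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount();
    }
    long entrySize = totalSize / numEntries;

    // 5% slack so entriesPerManifest entries fit comfortably into one manifest.
    long targetSize = (long) (1.05 * entriesPerManifest * entrySize);
    table
        .updateProperties()
        .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetSize))
        .commit();

    return SparkActions.get().rewriteManifests(table).rewriteIf(manifest -> true).execute();
  }
}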
table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); - - Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount()); - Assert.assertFalse(newManifests.get(0).hasAddedFiles()); - Assert.assertFalse(newManifests.get(0).hasDeletedFiles()); - - Assert.assertEquals(4, (long) newManifests.get(1).existingFilesCount()); - Assert.assertFalse(newManifests.get(1).hasAddedFiles()); - Assert.assertFalse(newManifests.get(1).hasDeletedFiles()); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - expectedRecords.addAll(records3); - expectedRecords.addAll(records4); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testRewriteImportedManifests() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - Dataset inputDF = spark.createDataFrame(records, ThreeColumnRecord.class); - inputDF - .select("c1", "c2", "c3") - .write() - .format("parquet") - .mode("overwrite") - .option("path", parquetTableLocation) - .partitionBy("c3") - .saveAsTable("parquet_table"); - - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable( - spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - // add some more data to create more than one manifest for the rewrite - inputDF.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - table.refresh(); - - Snapshot snapshot = table.currentSnapshot(); - - SparkActions actions = SparkActions.get(); - - RewriteManifests.Result result = - actions - .rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); - - Assert.assertEquals( - "Action should rewrite all manifests", - snapshot.allManifests(table.io()), - result.rewrittenManifests()); - Assert.assertEquals( - "Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); - - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testRewriteLargeManifestsPartitionedTable() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - // all records belong to the same partition - List records = Lists.newArrayList(); - for (int i = 0; i < 50; i++) { - records.add(new ThreeColumnRecord(i, String.valueOf(i), "0")); - } - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - // repartition to create separate files - writeDF(df.repartition(50, df.col("c1"))); - - table.refresh(); - - List manifests = 
table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 1 manifests before rewrite", 1, manifests.size()); - - // set the target manifest size to a small value to force splitting records into multiple files - table - .updateProperties() - .set( - TableProperties.MANIFEST_TARGET_SIZE_BYTES, - String.valueOf(manifests.get(0).length() / 2)) - .commit(); - - SparkActions actions = SparkActions.get(); - - RewriteManifests.Result result = - actions - .rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); - - Assert.assertEquals( - "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals( - "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); - - table.refresh(); - - List newManifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", records, actualRecords); - } - - @Test - public void testRewriteManifestsWithPredicate() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - writeRecords(records1); - - writeRecords(records1); - - List records2 = - Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); - writeRecords(records2); - - table.refresh(); - - List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 3 manifests before rewrite", 3, manifests.size()); - - SparkActions actions = SparkActions.get(); - - // rewrite only the first manifest without caching - RewriteManifests.Result result = - actions - .rewriteManifests(table) - .rewriteIf( - manifest -> - (manifest.path().equals(manifests.get(0).path()) - || (manifest.path().equals(manifests.get(1).path())))) - .stagingLocation(temp.newFolder().toString()) - .option("use-caching", "false") - .execute(); - - Assert.assertEquals( - "Action should rewrite 2 manifest", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals( - "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); - - table.refresh(); - - List newManifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); - - Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0))); - Assert.assertFalse( - "Second manifest must be rewritten", newManifests.contains(manifests.get(1))); - Assert.assertTrue( - "Third manifest must not be rewritten", newManifests.contains(manifests.get(2))); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.add(records1.get(0)); - expectedRecords.add(records1.get(0)); - expectedRecords.add(records1.get(1)); - expectedRecords.add(records1.get(1)); - expectedRecords.addAll(records2); - - Dataset resultDF = 
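testRewriteManifestsWithPredicate above rewrites only the manifests matched by rewriteIf, writes the replacements to a staging location, and turns off dataset caching through an action option. Sketched in isolation (the "use-caching" option name is taken directly from the removed test):

import java.util.Set;

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;

class SelectiveRewriteSketch {
  static RewriteManifests.Result rewriteOnly(Table table, Set<String> manifestPaths, String stagingDir) {
    return SparkActions.get()
        .rewriteManifests(table)
        .rewriteIf(manifest -> manifestPaths.contains(manifest.path())) // pick manifests by path
        .stagingLocation(stagingDir)                                    // write new manifests here first
        .option("use-caching", "false")                                 // disable caching during the rewrite, as in the test
        .execute();
  }
}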
spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - } - - @Test - public void testRewriteManifestsNoOp() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); - Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - - List records1 = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); - writeRecords(records1); - - table.refresh(); - - List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 1 manifest before rewrite", 1, manifests.size()); - - SparkActions actions = SparkActions.get(); - - RewriteManifests.Result result = actions.rewriteManifests(table).execute(); - - Assert.assertEquals( - "Action should rewrite 0 manifests", 0, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals( - "Action should add 0 manifests", 0, Iterables.size(result.addedManifests())); - } - - @Test - public void testRewriteSmallManifestsNonPartitionedV2Table() { - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map properties = ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"); - Table table = TABLES.create(SCHEMA, spec, properties, tableLocation); - - List records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA")); - writeRecords(records1); - - table.refresh(); - - Snapshot snapshot1 = table.currentSnapshot(); - DataFile file1 = Iterables.getOnlyElement(snapshot1.addedDataFiles(table.io())); - - List records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCC", "CCCC")); - writeRecords(records2); - - table.refresh(); - - Snapshot snapshot2 = table.currentSnapshot(); - DataFile file2 = Iterables.getOnlyElement(snapshot2.addedDataFiles(table.io())); - - List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size()); - - SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table).execute(); - Assert.assertEquals( - "Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals( - "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); - - table.refresh(); - - List newManifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 1 manifests after rewrite", 1, newManifests.size()); - - ManifestFile newManifest = Iterables.getOnlyElement(newManifests); - Assert.assertEquals(2, (long) newManifest.existingFilesCount()); - Assert.assertFalse(newManifest.hasAddedFiles()); - Assert.assertFalse(newManifest.hasDeletedFiles()); - - validateDataManifest( - table, - newManifest, - dataSeqs(1L, 2L), - fileSeqs(1L, 2L), - snapshotIds(snapshot1.snapshotId(), snapshot2.snapshotId()), - files(file1, file2)); - - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(records1); - expectedRecords.addAll(records2); - - Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = - resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", expectedRecords, 
actualRecords); - } - - private void writeRecords(List records) { - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - writeDF(df); - } - - private void writeDF(Dataset df) { - df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - } - - private long computeManifestEntrySizeBytes(List manifests) { - long totalSize = 0L; - int numEntries = 0; - - for (ManifestFile manifest : manifests) { - totalSize += manifest.length(); - numEntries += - manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); - } - - return totalSize / numEntries; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java deleted file mode 100644 index 2e99ca98ba16..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.IOException; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkSQLProperties; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.ListType; -import org.apache.iceberg.types.Types.LongType; -import org.apache.iceberg.types.Types.MapType; -import org.apache.iceberg.types.Types.StructType; -import org.apache.spark.sql.internal.SQLConf; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public abstract class AvroDataTest { - - protected abstract void writeAndValidate(Schema schema) throws IOException; - - protected static final StructType SUPPORTED_PRIMITIVES = - StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - // required(111, "uuid", Types.UUIDType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Test - public void testSimpleStruct() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields()))); - } - - @Test - public void testStructWithRequiredFields() throws IOException { - writeAndValidate( - TypeUtil.assignIncreasingFreshIds( - new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); - } - - @Test - public void testStructWithOptionalFields() throws IOException { - writeAndValidate( - TypeUtil.assignIncreasingFreshIds( - new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); - } - - @Test - public void testNestedStruct() throws IOException { - writeAndValidate( - TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); - } - - @Test - public void testArray() throws IOException { - Schema schema = - new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); - - writeAndValidate(schema); - } - - @Test - public void testArrayOfStructs() throws IOException { - Schema schema = - TypeUtil.assignIncreasingFreshIds( - new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); - - writeAndValidate(schema); - } - - @Test - public void testMap() throws IOException { - Schema schema = - new Schema( - required(0, "id", LongType.get()), - optional( 
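The removed AvroDataTest composes its schemas from the Types factories, with explicit field, element, and key/value IDs. For reference, a minimal schema built in the same style (field names and IDs here are illustrative):

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

class SchemaSketch {
  // Every field, list element, and map key/value carries its own unique ID within the schema.
  static final Schema EXAMPLE =
      new Schema(
          required(1, "id", Types.LongType.get()),
          optional(2, "tags", Types.ListType.ofOptional(4, Types.StringType.get())),
          optional(
              3,
              "properties",
              Types.MapType.ofOptional(5, 6, Types.StringType.get(), Types.StringType.get())));
}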
- 1, - "data", - MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); - - writeAndValidate(schema); - } - - @Test - public void testNumericMapKey() throws IOException { - Schema schema = - new Schema( - required(0, "id", LongType.get()), - optional( - 1, "data", MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); - - writeAndValidate(schema); - } - - @Test - public void testComplexMapKey() throws IOException { - Schema schema = - new Schema( - required(0, "id", LongType.get()), - optional( - 1, - "data", - MapType.ofOptional( - 2, - 3, - Types.StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); - - writeAndValidate(schema); - } - - @Test - public void testMapOfStructs() throws IOException { - Schema schema = - TypeUtil.assignIncreasingFreshIds( - new Schema( - required(0, "id", LongType.get()), - optional( - 1, - "data", - MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES)))); - - writeAndValidate(schema); - } - - @Test - public void testMixedTypes() throws IOException { - StructType structType = - StructType.of( - required(0, "id", LongType.get()), - optional( - 1, - "list_of_maps", - ListType.ofOptional( - 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), - optional( - 5, - "map_of_lists", - MapType.ofOptional( - 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required( - 9, - "list_of_lists", - ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required( - 12, - "map_of_maps", - MapType.ofOptional( - 13, - 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), - required( - 17, - "list_of_struct_of_nested_types", - ListType.ofOptional( - 19, - StructType.of( - Types.NestedField.required( - 20, - "m1", - MapType.ofOptional( - 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), - Types.NestedField.optional( - 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required( - 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional( - 27, - "m2", - MapType.ofOptional( - 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); - - Schema schema = - new Schema( - TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType() - .fields()); - - writeAndValidate(schema); - } - - @Test - public void testTimestampWithoutZone() throws IOException { - withSQLConf( - ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), - () -> { - Schema schema = - TypeUtil.assignIncreasingFreshIds( - new Schema( - required(0, "id", LongType.get()), - optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); - - writeAndValidate(schema); - }); - } - - protected void withSQLConf(Map conf, Action action) throws IOException { - SQLConf sqlConf = SQLConf.get(); - - Map currentConfValues = Maps.newHashMap(); - conf.keySet() - .forEach( - confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach( - (confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); - - try { - action.invoke(); - } finally { - conf.forEach( - (confKey, 
confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); - } - } - - @FunctionalInterface - protected interface Action { - void invoke() throws IOException; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java deleted file mode 100644 index a96e3b1f57f5..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.sql.Timestamp; -import java.time.Instant; -import java.time.LocalDate; -import java.time.LocalDateTime; -import java.time.OffsetDateTime; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.time.temporal.ChronoUnit; -import java.util.Collection; -import java.util.Date; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.UTF8String; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import scala.collection.Seq; - -public class GenericsHelpers { - private GenericsHelpers() {} - - private static final OffsetDateTime EPOCH = Instant.ofEpochMilli(0L).atOffset(ZoneOffset.UTC); - private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - - public static void assertEqualsSafe(Types.StructType struct, Record expected, Row actual) { - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); - - Object expectedValue = expected.get(i); - Object actualValue = actual.get(i); - - assertEqualsSafe(fieldType, expectedValue, actualValue); - } - } - - private static void assertEqualsSafe( - Types.ListType list, Collection expected, List actual) { - Type elementType = list.elementType(); - List expectedElements = Lists.newArrayList(expected); - for (int i = 0; i < 
expectedElements.size(); i += 1) { - Object expectedValue = expectedElements.get(i); - Object actualValue = actual.get(i); - - assertEqualsSafe(elementType, expectedValue, actualValue); - } - } - - private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { - Type keyType = map.keyType(); - Type valueType = map.valueType(); - Assert.assertEquals( - "Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); - - for (Object expectedKey : expected.keySet()) { - Object matchingKey = null; - for (Object actualKey : actual.keySet()) { - try { - assertEqualsSafe(keyType, expectedKey, actualKey); - matchingKey = actualKey; - break; - } catch (AssertionError e) { - // failed - } - } - - Assert.assertNotNull("Should have a matching key", matchingKey); - assertEqualsSafe(valueType, expected.get(expectedKey), actual.get(matchingKey)); - } - } - - @SuppressWarnings("unchecked") - private static void assertEqualsSafe(Type type, Object expected, Object actual) { - if (expected == null && actual == null) { - return; - } - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - Assert.assertEquals("Primitive value should be equal to expected", expected, actual); - break; - case DATE: - Assertions.assertThat(expected) - .as("Should expect a LocalDate") - .isInstanceOf(LocalDate.class); - Assertions.assertThat(actual).as("Should be a Date").isInstanceOf(Date.class); - Assert.assertEquals( - "ISO-8601 date should be equal", expected.toString(), actual.toString()); - break; - case TIMESTAMP: - Assertions.assertThat(actual).as("Should be a Timestamp").isInstanceOf(Timestamp.class); - Timestamp ts = (Timestamp) actual; - // milliseconds from nanos has already been added by getTime - OffsetDateTime actualTs = - EPOCH.plusNanos((ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); - Types.TimestampType timestampType = (Types.TimestampType) type; - if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected) - .as("Should expect an OffsetDateTime") - .isInstanceOf(OffsetDateTime.class); - Assert.assertEquals("Timestamp should be equal", expected, actualTs); - } else { - Assertions.assertThat(expected) - .as("Should expect an LocalDateTime") - .isInstanceOf(LocalDateTime.class); - Assert.assertEquals("Timestamp should be equal", expected, actualTs.toLocalDateTime()); - } - break; - case STRING: - Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("Strings should be equal", String.valueOf(expected), actual); - break; - case UUID: - Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", expected.toString(), actual); - break; - case FIXED: - Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); - break; - case BINARY: - Assertions.assertThat(expected) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals( - "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); - break; - case DECIMAL: - Assertions.assertThat(expected) - 
.as("Should expect a BigDecimal") - .isInstanceOf(BigDecimal.class); - Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); - Assert.assertEquals("BigDecimals should be equal", expected, actual); - break; - case STRUCT: - Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be a Row").isInstanceOf(Row.class); - assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); - break; - case LIST: - Assertions.assertThat(expected) - .as("Should expect a Collection") - .isInstanceOf(Collection.class); - Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); - List asList = seqAsJavaListConverter((Seq) actual).asJava(); - assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); - break; - case MAP: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual) - .as("Should be a Map") - .isInstanceOf(scala.collection.Map.class); - Map asMap = - mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); - assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); - break; - case TIME: - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - public static void assertEqualsUnsafe( - Types.StructType struct, Record expected, InternalRow actual) { - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); - - Object expectedValue = expected.get(i); - Object actualValue = actual.get(i, convert(fieldType)); - - assertEqualsUnsafe(fieldType, expectedValue, actualValue); - } - } - - private static void assertEqualsUnsafe( - Types.ListType list, Collection expected, ArrayData actual) { - Type elementType = list.elementType(); - List expectedElements = Lists.newArrayList(expected); - for (int i = 0; i < expectedElements.size(); i += 1) { - Object expectedValue = expectedElements.get(i); - Object actualValue = actual.get(i, convert(elementType)); - - assertEqualsUnsafe(elementType, expectedValue, actualValue); - } - } - - private static void assertEqualsUnsafe(Types.MapType map, Map expected, MapData actual) { - Type keyType = map.keyType(); - Type valueType = map.valueType(); - - List> expectedElements = Lists.newArrayList(expected.entrySet()); - ArrayData actualKeys = actual.keyArray(); - ArrayData actualValues = actual.valueArray(); - - for (int i = 0; i < expectedElements.size(); i += 1) { - Map.Entry expectedPair = expectedElements.get(i); - Object actualKey = actualKeys.get(i, convert(keyType)); - Object actualValue = actualValues.get(i, convert(keyType)); - - assertEqualsUnsafe(keyType, expectedPair.getKey(), actualKey); - assertEqualsUnsafe(valueType, expectedPair.getValue(), actualValue); - } - } - - private static void assertEqualsUnsafe(Type type, Object expected, Object actual) { - if (expected == null && actual == null) { - return; - } - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - Assert.assertEquals("Primitive value should be equal to expected", expected, actual); - break; - case DATE: - Assertions.assertThat(expected) - .as("Should expect a LocalDate") - .isInstanceOf(LocalDate.class); - int expectedDays = (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) expected); - Assert.assertEquals("Primitive value should be equal to expected", expectedDays, actual); - break; - 
case TIMESTAMP: - Types.TimestampType timestampType = (Types.TimestampType) type; - if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected) - .as("Should expect an OffsetDateTime") - .isInstanceOf(OffsetDateTime.class); - long expectedMicros = ChronoUnit.MICROS.between(EPOCH, (OffsetDateTime) expected); - Assert.assertEquals( - "Primitive value should be equal to expected", expectedMicros, actual); - } else { - Assertions.assertThat(expected) - .as("Should expect an LocalDateTime") - .isInstanceOf(LocalDateTime.class); - long expectedMicros = - ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); - Assert.assertEquals( - "Primitive value should be equal to expected", expectedMicros, actual); - } - break; - case STRING: - Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("Strings should be equal", expected, actual.toString()); - break; - case UUID: - Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals( - "UUID string representation should match", expected.toString(), actual.toString()); - break; - case FIXED: - Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); - break; - case BINARY: - Assertions.assertThat(expected) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals( - "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); - break; - case DECIMAL: - Assertions.assertThat(expected) - .as("Should expect a BigDecimal") - .isInstanceOf(BigDecimal.class); - Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals( - "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); - break; - case STRUCT: - Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual) - .as("Should be an InternalRow") - .isInstanceOf(InternalRow.class); - assertEqualsUnsafe( - type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); - break; - case LIST: - Assertions.assertThat(expected) - .as("Should expect a Collection") - .isInstanceOf(Collection.class); - Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe( - type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); - break; - case MAP: - Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual) - .as("Should be an ArrayBasedMapData") - .isInstanceOf(MapData.class); - assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); - break; - case TIME: - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java deleted file mode 100644 index 1c95df8ced12..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java 
+++ /dev/null @@ -1,368 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.Random; -import java.util.Set; -import java.util.UUID; -import java.util.function.Supplier; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericData.Record; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.RandomUtil; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.catalyst.util.ArrayBasedMapData; -import org.apache.spark.sql.catalyst.util.GenericArrayData; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.types.UTF8String; - -public class RandomData { - - // Default percentage of number of values that are null for optional fields - public static final float DEFAULT_NULL_PERCENTAGE = 0.05f; - - private RandomData() {} - - public static List generateList(Schema schema, int numRecords, long seed) { - RandomDataGenerator generator = new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE); - List records = Lists.newArrayListWithExpectedSize(numRecords); - for (int i = 0; i < numRecords; i += 1) { - records.add((Record) TypeUtil.visit(schema, generator)); - } - - return records; - } - - public static Iterable generateSpark(Schema schema, int numRecords, long seed) { - return () -> - new Iterator() { - private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); - private int count = 0; - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public InternalRow next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (InternalRow) TypeUtil.visit(schema, generator); - } - }; - } - - public static Iterable generate(Schema schema, int numRecords, long seed) { - return newIterable( - () -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); - } - - public static Iterable generate( - Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable( - () 
-> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); - } - - public static Iterable generateFallbackData( - Schema schema, int numRecords, long seed, long numDictRecords) { - return newIterable( - () -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); - } - - public static Iterable generateDictionaryEncodableData( - Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable( - () -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); - } - - private static Iterable newIterable( - Supplier newGenerator, Schema schema, int numRecords) { - return () -> - new Iterator() { - private int count = 0; - private RandomDataGenerator generator = newGenerator.get(); - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public Record next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (Record) TypeUtil.visit(schema, generator); - } - }; - } - - private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { - private final Map typeToSchema; - private final Random random; - // Percentage of number of values that are null for optional fields - private final float nullPercentage; - - private RandomDataGenerator(Schema schema, long seed, float nullPercentage) { - Preconditions.checkArgument( - 0.0f <= nullPercentage && nullPercentage <= 1.0f, - "Percentage needs to be in the range (0.0, 1.0)"); - this.nullPercentage = nullPercentage; - this.typeToSchema = AvroSchemaUtil.convertTypes(schema.asStruct(), "test"); - this.random = new Random(seed); - } - - @Override - public Record schema(Schema schema, Supplier structResult) { - return (Record) structResult.get(); - } - - @Override - public Record struct(Types.StructType struct, Iterable fieldResults) { - Record rec = new Record(typeToSchema.get(struct)); - - List values = Lists.newArrayList(fieldResults); - for (int i = 0; i < values.size(); i += 1) { - rec.put(i, values.get(i)); - } - - return rec; - } - - @Override - public Object field(Types.NestedField field, Supplier fieldResult) { - if (field.isOptional() && isNull()) { - return null; - } - return fieldResult.get(); - } - - private boolean isNull() { - return random.nextFloat() < nullPercentage; - } - - @Override - public Object list(Types.ListType list, Supplier elementResult) { - int numElements = random.nextInt(20); - - List result = Lists.newArrayListWithExpectedSize(numElements); - for (int i = 0; i < numElements; i += 1) { - if (list.isElementOptional() && isNull()) { - result.add(null); - } else { - result.add(elementResult.get()); - } - } - - return result; - } - - @Override - public Object map(Types.MapType map, Supplier keyResult, Supplier valueResult) { - int numEntries = random.nextInt(20); - - Map result = Maps.newLinkedHashMap(); - Set keySet = Sets.newHashSet(); - for (int i = 0; i < numEntries; i += 1) { - Object key = keyResult.get(); - // ensure no collisions - while (keySet.contains(key)) { - key = keyResult.get(); - } - - keySet.add(key); - - if (map.isValueOptional() && isNull()) { - result.put(key, null); - } else { - result.put(key, valueResult.get()); - } - } - - return result; - } - - @Override - public Object primitive(Type.PrimitiveType primitive) { - Object result = randomValue(primitive, random); - // For the primitives that Avro needs a different type than Spark, fix - // them here. 
- switch (primitive.typeId()) { - case FIXED: - return new GenericData.Fixed(typeToSchema.get(primitive), (byte[]) result); - case BINARY: - return ByteBuffer.wrap((byte[]) result); - case UUID: - return UUID.nameUUIDFromBytes((byte[]) result); - default: - return result; - } - } - - protected Object randomValue(Type.PrimitiveType primitive, Random rand) { - return RandomUtil.generatePrimitive(primitive, random); - } - } - - private static class SparkRandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { - private final Random random; - - private SparkRandomDataGenerator(long seed) { - this.random = new Random(seed); - } - - @Override - public InternalRow schema(Schema schema, Supplier structResult) { - return (InternalRow) structResult.get(); - } - - @Override - public InternalRow struct(Types.StructType struct, Iterable fieldResults) { - List values = Lists.newArrayList(fieldResults); - GenericInternalRow row = new GenericInternalRow(values.size()); - for (int i = 0; i < values.size(); i += 1) { - row.update(i, values.get(i)); - } - - return row; - } - - @Override - public Object field(Types.NestedField field, Supplier fieldResult) { - // return null 5% of the time when the value is optional - if (field.isOptional() && random.nextInt(20) == 1) { - return null; - } - return fieldResult.get(); - } - - @Override - public GenericArrayData list(Types.ListType list, Supplier elementResult) { - int numElements = random.nextInt(20); - Object[] arr = new Object[numElements]; - GenericArrayData result = new GenericArrayData(arr); - - for (int i = 0; i < numElements; i += 1) { - // return null 5% of the time when the value is optional - if (list.isElementOptional() && random.nextInt(20) == 1) { - arr[i] = null; - } else { - arr[i] = elementResult.get(); - } - } - - return result; - } - - @Override - public Object map(Types.MapType map, Supplier keyResult, Supplier valueResult) { - int numEntries = random.nextInt(20); - - Object[] keysArr = new Object[numEntries]; - Object[] valuesArr = new Object[numEntries]; - GenericArrayData keys = new GenericArrayData(keysArr); - GenericArrayData values = new GenericArrayData(valuesArr); - ArrayBasedMapData result = new ArrayBasedMapData(keys, values); - - Set keySet = Sets.newHashSet(); - for (int i = 0; i < numEntries; i += 1) { - Object key = keyResult.get(); - // ensure no collisions - while (keySet.contains(key)) { - key = keyResult.get(); - } - - keySet.add(key); - - keysArr[i] = key; - // return null 5% of the time when the value is optional - if (map.isValueOptional() && random.nextInt(20) == 1) { - valuesArr[i] = null; - } else { - valuesArr[i] = valueResult.get(); - } - } - - return result; - } - - @Override - public Object primitive(Type.PrimitiveType primitive) { - Object obj = RandomUtil.generatePrimitive(primitive, random); - switch (primitive.typeId()) { - case STRING: - return UTF8String.fromString((String) obj); - case DECIMAL: - return Decimal.apply((BigDecimal) obj); - default: - return obj; - } - } - } - - private static class DictionaryEncodedDataGenerator extends RandomDataGenerator { - private DictionaryEncodedDataGenerator(Schema schema, long seed, float nullPercentage) { - super(schema, seed, nullPercentage); - } - - @Override - protected Object randomValue(Type.PrimitiveType primitive, Random random) { - return RandomUtil.generateDictionaryEncodablePrimitive(primitive, random); - } - } - - private static class FallbackDataGenerator extends RandomDataGenerator { - private final long dictionaryEncodedRows; - private long 
rowCount = 0; - - private FallbackDataGenerator(Schema schema, long seed, long numDictionaryEncoded) { - super(schema, seed, DEFAULT_NULL_PERCENTAGE); - this.dictionaryEncodedRows = numDictionaryEncoded; - } - - @Override - protected Object randomValue(Type.PrimitiveType primitive, Random rand) { - this.rowCount += 1; - if (rowCount > dictionaryEncodedRows) { - return RandomUtil.generatePrimitive(primitive, rand); - } else { - return RandomUtil.generateDictionaryEncodablePrimitive(primitive, rand); - } - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java deleted file mode 100644 index 42f4c1a1ab42..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java +++ /dev/null @@ -1,770 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.sql.Timestamp; -import java.time.Instant; -import java.time.LocalDate; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.time.temporal.ChronoUnit; -import java.util.Collection; -import java.util.Date; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import org.apache.arrow.vector.ValueVector; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericData.Record; -import org.apache.iceberg.Schema; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.apache.orc.storage.serde2.io.DateWritable; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericRow; -import org.apache.spark.sql.catalyst.expressions.SpecializedGetters; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.DateTimeUtils; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.BinaryType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.sql.types.MapType; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.vectorized.ColumnVector; -import 
org.apache.spark.sql.vectorized.ColumnarBatch; -import org.apache.spark.unsafe.types.UTF8String; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import scala.collection.Seq; - -public class TestHelpers { - - private TestHelpers() {} - - public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row) { - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); - - Object expectedValue = rec.get(i); - Object actualValue = row.get(i); - - assertEqualsSafe(fieldType, expectedValue, actualValue); - } - } - - public static void assertEqualsBatch( - Types.StructType struct, - Iterator expected, - ColumnarBatch batch, - boolean checkArrowValidityVector) { - for (int rowId = 0; rowId < batch.numRows(); rowId++) { - List fields = struct.fields(); - InternalRow row = batch.getRow(rowId); - Record rec = expected.next(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); - Object expectedValue = rec.get(i); - Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType)); - assertEqualsUnsafe(fieldType, expectedValue, actualValue); - - if (checkArrowValidityVector) { - ColumnVector columnVector = batch.column(i); - ValueVector arrowVector = - ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); - Assert.assertFalse( - "Nullability doesn't match of " + columnVector.dataType(), - expectedValue == null ^ arrowVector.isNull(rowId)); - } - } - } - } - - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { - Type elementType = list.elementType(); - List expectedElements = Lists.newArrayList(expected); - for (int i = 0; i < expectedElements.size(); i += 1) { - Object expectedValue = expectedElements.get(i); - Object actualValue = actual.get(i); - - assertEqualsSafe(elementType, expectedValue, actualValue); - } - } - - private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { - Type keyType = map.keyType(); - Type valueType = map.valueType(); - - for (Object expectedKey : expected.keySet()) { - Object matchingKey = null; - for (Object actualKey : actual.keySet()) { - try { - assertEqualsSafe(keyType, expectedKey, actualKey); - matchingKey = actualKey; - } catch (AssertionError e) { - // failed - } - } - - Assert.assertNotNull("Should have a matching key", matchingKey); - assertEqualsSafe(valueType, expected.get(expectedKey), actual.get(matchingKey)); - } - } - - private static final OffsetDateTime EPOCH = Instant.ofEpochMilli(0L).atOffset(ZoneOffset.UTC); - private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - - @SuppressWarnings("unchecked") - private static void assertEqualsSafe(Type type, Object expected, Object actual) { - if (expected == null && actual == null) { - return; - } - - switch (type.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - Assert.assertEquals("Primitive value should be equal to expected", expected, actual); - break; - case DATE: - Assertions.assertThat(expected).as("Should be an int").isInstanceOf(Integer.class); - Assertions.assertThat(actual).as("Should be a Date").isInstanceOf(Date.class); - int daysFromEpoch = (Integer) expected; - LocalDate date = ChronoUnit.DAYS.addTo(EPOCH_DAY, daysFromEpoch); - Assert.assertEquals("ISO-8601 date should be equal", date.toString(), actual.toString()); - break; - case TIMESTAMP: - Assertions.assertThat(expected).as("Should be a long").isInstanceOf(Long.class); - 
Assertions.assertThat(actual).as("Should be a Timestamp").isInstanceOf(Timestamp.class); - Timestamp ts = (Timestamp) actual; - // milliseconds from nanos has already been added by getTime - long tsMicros = (ts.getTime() * 1000) + ((ts.getNanos() / 1000) % 1000); - Assert.assertEquals("Timestamp micros should be equal", expected, tsMicros); - break; - case STRING: - Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("Strings should be equal", String.valueOf(expected), actual); - break; - case UUID: - Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", expected.toString(), actual); - break; - case FIXED: - Assertions.assertThat(expected) - .as("Should expect a Fixed") - .isInstanceOf(GenericData.Fixed.class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals( - "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); - break; - case BINARY: - Assertions.assertThat(expected) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals( - "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); - break; - case DECIMAL: - Assertions.assertThat(expected) - .as("Should expect a BigDecimal") - .isInstanceOf(BigDecimal.class); - Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); - Assert.assertEquals("BigDecimals should be equal", expected, actual); - break; - case STRUCT: - Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be a Row").isInstanceOf(Row.class); - assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); - break; - case LIST: - Assertions.assertThat(expected) - .as("Should expect a Collection") - .isInstanceOf(Collection.class); - Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); - List asList = seqAsJavaListConverter((Seq) actual).asJava(); - assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); - break; - case MAP: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual) - .as("Should be a Map") - .isInstanceOf(scala.collection.Map.class); - Map asMap = - mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); - assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); - break; - case TIME: - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - public static void assertEqualsUnsafe(Types.StructType struct, Record rec, InternalRow row) { - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Type fieldType = fields.get(i).type(); - - Object expectedValue = rec.get(i); - Object actualValue = row.isNullAt(i) ? 
null : row.get(i, convert(fieldType)); - - assertEqualsUnsafe(fieldType, expectedValue, actualValue); - } - } - - private static void assertEqualsUnsafe( - Types.ListType list, Collection expected, ArrayData actual) { - Type elementType = list.elementType(); - List expectedElements = Lists.newArrayList(expected); - for (int i = 0; i < expectedElements.size(); i += 1) { - Object expectedValue = expectedElements.get(i); - Object actualValue = actual.get(i, convert(elementType)); - - assertEqualsUnsafe(elementType, expectedValue, actualValue); - } - } - - private static void assertEqualsUnsafe(Types.MapType map, Map expected, MapData actual) { - Type keyType = map.keyType(); - Type valueType = map.valueType(); - - List> expectedElements = Lists.newArrayList(expected.entrySet()); - ArrayData actualKeys = actual.keyArray(); - ArrayData actualValues = actual.valueArray(); - - for (int i = 0; i < expectedElements.size(); i += 1) { - Map.Entry expectedPair = expectedElements.get(i); - Object actualKey = actualKeys.get(i, convert(keyType)); - Object actualValue = actualValues.get(i, convert(keyType)); - - assertEqualsUnsafe(keyType, expectedPair.getKey(), actualKey); - assertEqualsUnsafe(valueType, expectedPair.getValue(), actualValue); - } - } - - private static void assertEqualsUnsafe(Type type, Object expected, Object actual) { - if (expected == null && actual == null) { - return; - } - - switch (type.typeId()) { - case LONG: - Assertions.assertThat(actual).as("Should be a long").isInstanceOf(Long.class); - if (expected instanceof Integer) { - Assert.assertEquals("Values didn't match", ((Number) expected).longValue(), actual); - } else { - Assert.assertEquals("Primitive value should be equal to expected", expected, actual); - } - break; - case DOUBLE: - Assertions.assertThat(actual).as("Should be a double").isInstanceOf(Double.class); - if (expected instanceof Float) { - Assert.assertEquals( - "Values didn't match", - Double.doubleToLongBits(((Number) expected).doubleValue()), - Double.doubleToLongBits((double) actual)); - } else { - Assert.assertEquals("Primitive value should be equal to expected", expected, actual); - } - break; - case INTEGER: - case FLOAT: - case BOOLEAN: - case DATE: - case TIMESTAMP: - Assert.assertEquals("Primitive value should be equal to expected", expected, actual); - break; - case STRING: - Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("Strings should be equal", expected, actual.toString()); - break; - case UUID: - Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals( - "UUID string representation should match", expected.toString(), actual.toString()); - break; - case FIXED: - Assertions.assertThat(expected) - .as("Should expect a Fixed") - .isInstanceOf(GenericData.Fixed.class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals( - "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); - break; - case BINARY: - Assertions.assertThat(expected) - .as("Should expect a ByteBuffer") - .isInstanceOf(ByteBuffer.class); - Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals( - "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); - break; - case DECIMAL: - Assertions.assertThat(expected) - .as("Should 
expect a BigDecimal") - .isInstanceOf(BigDecimal.class); - Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals( - "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); - break; - case STRUCT: - Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual) - .as("Should be an InternalRow") - .isInstanceOf(InternalRow.class); - assertEqualsUnsafe( - type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); - break; - case LIST: - Assertions.assertThat(expected) - .as("Should expect a Collection") - .isInstanceOf(Collection.class); - Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe( - type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); - break; - case MAP: - Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual) - .as("Should be an ArrayBasedMapData") - .isInstanceOf(MapData.class); - assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); - break; - case TIME: - default: - throw new IllegalArgumentException("Not a supported type: " + type); - } - } - - /** - * Check that the given InternalRow is equivalent to the Row. - * - * @param prefix context for error messages - * @param type the type of the row - * @param expected the expected value of the row - * @param actual the actual value of the row - */ - public static void assertEquals( - String prefix, Types.StructType type, InternalRow expected, Row actual) { - if (expected == null || actual == null) { - Assert.assertEquals(prefix, expected, actual); - } else { - List fields = type.fields(); - for (int c = 0; c < fields.size(); ++c) { - String fieldName = fields.get(c).name(); - Type childType = fields.get(c).type(); - switch (childType.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - case STRING: - case DECIMAL: - case DATE: - case TIMESTAMP: - Assert.assertEquals( - prefix + "." + fieldName + " - " + childType, - getValue(expected, c, childType), - getPrimitiveValue(actual, c, childType)); - break; - case UUID: - case FIXED: - case BINARY: - assertEqualBytes( - prefix + "." + fieldName, - (byte[]) getValue(expected, c, childType), - (byte[]) actual.get(c)); - break; - case STRUCT: - { - Types.StructType st = (Types.StructType) childType; - assertEquals( - prefix + "." + fieldName, - st, - expected.getStruct(c, st.fields().size()), - actual.getStruct(c)); - break; - } - case LIST: - assertEqualsLists( - prefix + "." + fieldName, - childType.asListType(), - expected.getArray(c), - toList((Seq) actual.get(c))); - break; - case MAP: - assertEqualsMaps( - prefix + "." 
+ fieldName, - childType.asMapType(), - expected.getMap(c), - toJavaMap((scala.collection.Map) actual.getMap(c))); - break; - default: - throw new IllegalArgumentException("Unhandled type " + childType); - } - } - } - } - - private static void assertEqualsLists( - String prefix, Types.ListType type, ArrayData expected, List actual) { - if (expected == null || actual == null) { - Assert.assertEquals(prefix, expected, actual); - } else { - Assert.assertEquals(prefix + " length", expected.numElements(), actual.size()); - Type childType = type.elementType(); - for (int e = 0; e < expected.numElements(); ++e) { - switch (childType.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - case STRING: - case DECIMAL: - case DATE: - case TIMESTAMP: - Assert.assertEquals( - prefix + ".elem " + e + " - " + childType, - getValue(expected, e, childType), - actual.get(e)); - break; - case UUID: - case FIXED: - case BINARY: - assertEqualBytes( - prefix + ".elem " + e, - (byte[]) getValue(expected, e, childType), - (byte[]) actual.get(e)); - break; - case STRUCT: - { - Types.StructType st = (Types.StructType) childType; - assertEquals( - prefix + ".elem " + e, - st, - expected.getStruct(e, st.fields().size()), - (Row) actual.get(e)); - break; - } - case LIST: - assertEqualsLists( - prefix + ".elem " + e, - childType.asListType(), - expected.getArray(e), - toList((Seq) actual.get(e))); - break; - case MAP: - assertEqualsMaps( - prefix + ".elem " + e, - childType.asMapType(), - expected.getMap(e), - toJavaMap((scala.collection.Map) actual.get(e))); - break; - default: - throw new IllegalArgumentException("Unhandled type " + childType); - } - } - } - } - - private static void assertEqualsMaps( - String prefix, Types.MapType type, MapData expected, Map actual) { - if (expected == null || actual == null) { - Assert.assertEquals(prefix, expected, actual); - } else { - Type keyType = type.keyType(); - Type valueType = type.valueType(); - ArrayData expectedKeyArray = expected.keyArray(); - ArrayData expectedValueArray = expected.valueArray(); - Assert.assertEquals(prefix + " length", expected.numElements(), actual.size()); - for (int e = 0; e < expected.numElements(); ++e) { - Object expectedKey = getValue(expectedKeyArray, e, keyType); - Object actualValue = actual.get(expectedKey); - if (actualValue == null) { - Assert.assertEquals( - prefix + ".key=" + expectedKey + " has null", - true, - expected.valueArray().isNullAt(e)); - } else { - switch (valueType.typeId()) { - case BOOLEAN: - case INTEGER: - case LONG: - case FLOAT: - case DOUBLE: - case STRING: - case DECIMAL: - case DATE: - case TIMESTAMP: - Assert.assertEquals( - prefix + ".key=" + expectedKey + " - " + valueType, - getValue(expectedValueArray, e, valueType), - actual.get(expectedKey)); - break; - case UUID: - case FIXED: - case BINARY: - assertEqualBytes( - prefix + ".key=" + expectedKey, - (byte[]) getValue(expectedValueArray, e, valueType), - (byte[]) actual.get(expectedKey)); - break; - case STRUCT: - { - Types.StructType st = (Types.StructType) valueType; - assertEquals( - prefix + ".key=" + expectedKey, - st, - expectedValueArray.getStruct(e, st.fields().size()), - (Row) actual.get(expectedKey)); - break; - } - case LIST: - assertEqualsLists( - prefix + ".key=" + expectedKey, - valueType.asListType(), - expectedValueArray.getArray(e), - toList((Seq) actual.get(expectedKey))); - break; - case MAP: - assertEqualsMaps( - prefix + ".key=" + expectedKey, - valueType.asMapType(), - expectedValueArray.getMap(e), - 
toJavaMap((scala.collection.Map) actual.get(expectedKey))); - break; - default: - throw new IllegalArgumentException("Unhandled type " + valueType); - } - } - } - } - } - - private static Object getValue(SpecializedGetters container, int ord, Type type) { - if (container.isNullAt(ord)) { - return null; - } - switch (type.typeId()) { - case BOOLEAN: - return container.getBoolean(ord); - case INTEGER: - return container.getInt(ord); - case LONG: - return container.getLong(ord); - case FLOAT: - return container.getFloat(ord); - case DOUBLE: - return container.getDouble(ord); - case STRING: - return container.getUTF8String(ord).toString(); - case BINARY: - case FIXED: - case UUID: - return container.getBinary(ord); - case DATE: - return new DateWritable(container.getInt(ord)).get(); - case TIMESTAMP: - return DateTimeUtils.toJavaTimestamp(container.getLong(ord)); - case DECIMAL: - { - Types.DecimalType dt = (Types.DecimalType) type; - return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); - } - case STRUCT: - Types.StructType struct = type.asStructType(); - InternalRow internalRow = container.getStruct(ord, struct.fields().size()); - Object[] data = new Object[struct.fields().size()]; - for (int i = 0; i < data.length; i += 1) { - if (internalRow.isNullAt(i)) { - data[i] = null; - } else { - data[i] = getValue(internalRow, i, struct.fields().get(i).type()); - } - } - return new GenericRow(data); - default: - throw new IllegalArgumentException("Unhandled type " + type); - } - } - - private static Object getPrimitiveValue(Row row, int ord, Type type) { - if (row.isNullAt(ord)) { - return null; - } - switch (type.typeId()) { - case BOOLEAN: - return row.getBoolean(ord); - case INTEGER: - return row.getInt(ord); - case LONG: - return row.getLong(ord); - case FLOAT: - return row.getFloat(ord); - case DOUBLE: - return row.getDouble(ord); - case STRING: - return row.getString(ord); - case BINARY: - case FIXED: - case UUID: - return row.get(ord); - case DATE: - return row.getDate(ord); - case TIMESTAMP: - return row.getTimestamp(ord); - case DECIMAL: - return row.getDecimal(ord); - default: - throw new IllegalArgumentException("Unhandled type " + type); - } - } - - private static Map toJavaMap(scala.collection.Map map) { - return map == null ? null : mapAsJavaMapConverter(map).asJava(); - } - - private static List toList(Seq val) { - return val == null ? 
null : seqAsJavaListConverter(val).asJava(); - } - - private static void assertEqualBytes(String context, byte[] expected, byte[] actual) { - if (expected == null || actual == null) { - Assert.assertEquals(context, expected, actual); - } else { - Assert.assertArrayEquals(context, expected, actual); - } - } - - static void assertEquals(Schema schema, Object expected, Object actual) { - assertEquals("schema", convert(schema), expected, actual); - } - - private static void assertEquals(String context, DataType type, Object expected, Object actual) { - if (expected == null && actual == null) { - return; - } - - if (type instanceof StructType) { - Assertions.assertThat(expected) - .as("Expected should be an InternalRow: " + context) - .isInstanceOf(InternalRow.class); - Assertions.assertThat(actual) - .as("Actual should be an InternalRow: " + context) - .isInstanceOf(InternalRow.class); - assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual); - - } else if (type instanceof ArrayType) { - Assertions.assertThat(expected) - .as("Expected should be an ArrayData: " + context) - .isInstanceOf(ArrayData.class); - Assertions.assertThat(actual) - .as("Actual should be an ArrayData: " + context) - .isInstanceOf(ArrayData.class); - assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual); - - } else if (type instanceof MapType) { - Assertions.assertThat(expected) - .as("Expected should be a MapData: " + context) - .isInstanceOf(MapData.class); - Assertions.assertThat(actual) - .as("Actual should be a MapData: " + context) - .isInstanceOf(MapData.class); - assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual); - - } else if (type instanceof BinaryType) { - assertEqualBytes(context, (byte[]) expected, (byte[]) actual); - } else { - Assert.assertEquals("Value should match expected: " + context, expected, actual); - } - } - - private static void assertEquals( - String context, StructType struct, InternalRow expected, InternalRow actual) { - Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields()); - for (int i = 0; i < actual.numFields(); i += 1) { - StructField field = struct.fields()[i]; - DataType type = field.dataType(); - assertEquals( - context + "." + field.name(), - type, - expected.isNullAt(i) ? null : expected.get(i, type), - actual.isNullAt(i) ? null : actual.get(i, type)); - } - } - - private static void assertEquals( - String context, ArrayType array, ArrayData expected, ArrayData actual) { - Assert.assertEquals( - "Should have the same number of elements", expected.numElements(), actual.numElements()); - DataType type = array.elementType(); - for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals( - context + ".element", - type, - expected.isNullAt(i) ? null : expected.get(i, type), - actual.isNullAt(i) ? null : actual.get(i, type)); - } - } - - private static void assertEquals(String context, MapType map, MapData expected, MapData actual) { - Assert.assertEquals( - "Should have the same number of elements", expected.numElements(), actual.numElements()); - - DataType keyType = map.keyType(); - ArrayData expectedKeys = expected.keyArray(); - ArrayData expectedValues = expected.valueArray(); - - DataType valueType = map.valueType(); - ArrayData actualKeys = actual.keyArray(); - ArrayData actualValues = actual.valueArray(); - - for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals( - context + ".key", - keyType, - expectedKeys.isNullAt(i) ? 
null : expectedKeys.get(i, keyType), - actualKeys.isNullAt(i) ? null : actualKeys.get(i, keyType)); - assertEquals( - context + ".value", - valueType, - expectedValues.isNullAt(i) ? null : expectedValues.get(i, valueType), - actualValues.isNullAt(i) ? null : actualValues.get(i, valueType)); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java deleted file mode 100644 index 1e51a088390e..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestOrcWrite { - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - - @Test - public void splitOffsets() throws IOException { - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - Iterable rows = RandomData.generateSpark(SCHEMA, 1, 0L); - FileAppender writer = - ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(SCHEMA) - .build(); - - writer.addAll(rows); - writer.close(); - Assert.assertNotNull("Split offsets not present", writer.splitOffsets()); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java deleted file mode 100644 index a4ffc2fea437..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import org.apache.avro.generic.GenericData.Record; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetAvroValueReaders; -import org.apache.iceberg.parquet.ParquetSchemaUtil; -import org.apache.iceberg.types.Types; -import org.apache.parquet.schema.MessageType; -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestParquetAvroReader { - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = - new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required( - 5, - "strict", - Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional( - 6, - "hopeful", - Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()))), - optional(10, "vehement", Types.LongType.get()))), - optional( - 11, - "metamorphosis", - Types.MapType.ofRequired( - 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), - required( - 14, - "winter", - Types.ListType.ofOptional( - 15, - Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get())))), - optional( - 19, - "renovate", - Types.MapType.ofRequired( - 20, - 21, - Types.StringType.get(), - Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get())))), - optional(2, "slide", Types.StringType.get())); - - @Ignore - public void testStructSchema() throws IOException { - Schema structSchema = - new Schema( - required(1, "circumvent", Types.LongType.get()), - optional(2, "antarctica", Types.StringType.get()), - optional(3, "fluent", Types.DoubleType.get()), - required( - 4, - "quell", - Types.StructType.of( - required(5, "operator", Types.BooleanType.get()), - optional(6, "fanta", Types.IntegerType.get()), - optional(7, "cable", Types.FloatType.get()))), - required(8, "chimney", Types.TimestampType.withZone()), - required(9, "wool", Types.DateType.get())); - - File testFile = writeTestData(structSchema, 5_000_000, 1059); - // RandomData uses the root record name "test", which must match for records to be equal - MessageType readSchema = ParquetSchemaUtil.convert(structSchema, "test"); - - long sum = 0; - long sumSq = 0; - int warmups = 2; - int trials = 10; - - for (int i = 0; i < warmups + trials; i += 1) { - // clean up as much memory as possible to avoid a 
large GC during the timed run - System.gc(); - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(structSchema) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) - .build()) { - long start = System.currentTimeMillis(); - long val = 0; - long count = 0; - for (Record record : reader) { - // access something to ensure the compiler doesn't optimize this away - val ^= (Long) record.get(0); - count += 1; - } - long end = System.currentTimeMillis(); - long duration = end - start; - - if (i >= warmups) { - sum += duration; - sumSq += duration * duration; - } - } - } - - double mean = ((double) sum) / trials; - double stddev = Math.sqrt((((double) sumSq) / trials) - (mean * mean)); - } - - @Ignore - public void testWithOldReadPath() throws IOException { - File testFile = writeTestData(COMPLEX_SCHEMA, 500_000, 1985); - // RandomData uses the root record name "test", which must match for records to be equal - MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); - - for (int i = 0; i < 5; i += 1) { - // clean up as much memory as possible to avoid a large GC during the timed run - System.gc(); - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)).project(COMPLEX_SCHEMA).build()) { - long start = System.currentTimeMillis(); - long val = 0; - long count = 0; - for (Record record : reader) { - // access something to ensure the compiler doesn't optimize this away - val ^= (Long) record.get(0); - count += 1; - } - long end = System.currentTimeMillis(); - } - - // clean up as much memory as possible to avoid a large GC during the timed run - System.gc(); - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { - long start = System.currentTimeMillis(); - long val = 0; - long count = 0; - for (Record record : reader) { - // access something to ensure the compiler doesn't optimize this away - val ^= (Long) record.get(0); - count += 1; - } - long end = System.currentTimeMillis(); - } - } - } - - @Test - public void testCorrectness() throws IOException { - Iterable records = RandomData.generate(COMPLEX_SCHEMA, 50_000, 34139); - - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)).schema(COMPLEX_SCHEMA).build()) { - writer.addAll(records); - } - - // RandomData uses the root record name "test", which must match for records to be equal - MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); - - // verify that the new read path is correct - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .reuseContainers() - .build()) { - int recordNum = 0; - Iterator iter = records.iterator(); - for (Record actual : reader) { - Record expected = iter.next(); - Assert.assertEquals("Record " + recordNum + " should match expected", expected, actual); - recordNum += 1; - } - } - } - - private File writeTestData(Schema schema, int numRecords, int seed) throws IOException { - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - 
Parquet.write(Files.localOutput(testFile)).schema(schema).build()) { - writer.addAll(RandomData.generate(schema, numRecords, seed)); - } - - return testFile; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java deleted file mode 100644 index 15c6268da478..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import org.apache.avro.generic.GenericData.Record; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetAvroValueReaders; -import org.apache.iceberg.parquet.ParquetAvroWriter; -import org.apache.iceberg.parquet.ParquetSchemaUtil; -import org.apache.iceberg.types.Types; -import org.apache.parquet.schema.MessageType; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestParquetAvroWriter { - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = - new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required( - 5, - "strict", - Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional( - 6, - "hopeful", - Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()))), - optional(10, "vehement", Types.LongType.get()))), - optional( - 11, - "metamorphosis", - Types.MapType.ofRequired( - 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), - required( - 14, - "winter", - Types.ListType.ofOptional( - 15, - Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get())))), - optional( - 19, - "renovate", - Types.MapType.ofRequired( - 20, - 21, - Types.StringType.get(), - Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get())))), - optional(2, "slide", Types.StringType.get())); - - @Test - public 
void testCorrectness() throws IOException { - Iterable records = RandomData.generate(COMPLEX_SCHEMA, 50_000, 34139); - - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(ParquetAvroWriter::buildWriter) - .build()) { - writer.addAll(records); - } - - // RandomData uses the root record name "test", which must match for records to be equal - MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); - - // verify that the new read path is correct - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { - int recordNum = 0; - Iterator iter = records.iterator(); - for (Record actual : reader) { - Record expected = iter.next(); - Assert.assertEquals("Record " + recordNum + " should match expected", expected, actual); - recordNum += 1; - } - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java deleted file mode 100644 index 6f05a9ed7c1f..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericData.Record; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.avro.AvroIterable; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSparkAvroEnums { - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Test - public void writeAndValidateEnums() throws IOException { - org.apache.avro.Schema avroSchema = - SchemaBuilder.record("root") - .fields() - .name("enumCol") - .type() - .nullable() - .enumeration("testEnum") - .symbols("SYMB1", "SYMB2") - .enumDefault("SYMB2") - .endRecord(); - - org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0); - Record enumRecord1 = new GenericData.Record(avroSchema); - enumRecord1.put("enumCol", new GenericData.EnumSymbol(enumSchema, "SYMB1")); - Record enumRecord2 = new GenericData.Record(avroSchema); - enumRecord2.put("enumCol", new GenericData.EnumSymbol(enumSchema, "SYMB2")); - Record enumRecord3 = new GenericData.Record(avroSchema); // null enum - List expected = ImmutableList.of(enumRecord1, enumRecord2, enumRecord3); - - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (DataFileWriter writer = new DataFileWriter<>(new GenericDatumWriter<>())) { - writer.create(avroSchema, testFile); - writer.append(enumRecord1); - writer.append(enumRecord2); - writer.append(enumRecord3); - } - - Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields()); - List rows; - try (AvroIterable reader = - Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { - rows = Lists.newArrayList(reader); - } - - // Iceberg will return enums as strings, so we compare string values for the enum field - for (int i = 0; i < expected.size(); i += 1) { - String expectedEnumString = - expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString(); - String sparkString = - rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); - Assert.assertEquals(expectedEnumString, sparkString); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java deleted file mode 100644 index 6d1ef3db3657..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.data;
-
-import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.iceberg.Files;
-import org.apache.iceberg.Schema;
-import org.apache.iceberg.avro.Avro;
-import org.apache.iceberg.avro.AvroIterable;
-import org.apache.iceberg.io.FileAppender;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.spark.sql.catalyst.InternalRow;
-import org.junit.Assert;
-
-public class TestSparkAvroReader extends AvroDataTest {
-  @Override
-  protected void writeAndValidate(Schema schema) throws IOException {
-    List<Record> expected = RandomData.generateList(schema, 100, 0L);
-
-    File testFile = temp.newFile();
-    Assert.assertTrue("Delete should succeed", testFile.delete());
-
-    try (FileAppender<Record> writer =
-        Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) {
-      for (Record rec : expected) {
-        writer.add(rec);
-      }
-    }
-
-    List<InternalRow> rows;
-    try (AvroIterable<InternalRow> reader =
-        Avro.read(Files.localInput(testFile))
-            .createReaderFunc(SparkAvroReader::new)
-            .project(schema)
-            .build()) {
-      rows = Lists.newArrayList(reader);
-    }
-
-    for (int i = 0; i < expected.size(); i += 1) {
-      assertEqualsUnsafe(schema.asStruct(), expected.get(i), rows.get(i));
-    }
-  }
-}
diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java
deleted file mode 100644
index 56f3cf3c5d8b..000000000000
--- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.iceberg.spark.data;
-
-import java.util.TimeZone;
-import org.apache.iceberg.expressions.Literal;
-import org.apache.iceberg.types.Types;
-import org.apache.spark.sql.catalyst.util.DateTimeUtils;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestSparkDateTimes {
-  @Test
-  public void testSparkDate() {
-    // checkSparkDate("1582-10-14"); // -141428
-    checkSparkDate("1582-10-15"); // first day of the Gregorian calendar
-    checkSparkDate("1601-08-12");
-    checkSparkDate("1801-07-04");
-    checkSparkDate("1901-08-12");
-    checkSparkDate("1969-12-31");
-    checkSparkDate("1970-01-01");
-    checkSparkDate("2017-12-25");
-    checkSparkDate("2043-08-11");
-    checkSparkDate("2111-05-03");
-    checkSparkDate("2224-02-29");
-    checkSparkDate("3224-10-05");
-  }
-
-  public void checkSparkDate(String dateString) {
-    Literal<Integer> date = Literal.of(dateString).to(Types.DateType.get());
-    String sparkDate = DateTimeUtils.toJavaDate(date.value()).toString();
-    Assert.assertEquals("Should be the same date (" + date.value() + ")", dateString, sparkDate);
-  }
-
-  @Test
-  public void testSparkTimestamp() {
-    TimeZone currentTz = TimeZone.getDefault();
-    try {
-      TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
-      checkSparkTimestamp("1582-10-15T15:51:08.440219+00:00", "1582-10-15 15:51:08.440219");
-      checkSparkTimestamp("1970-01-01T00:00:00.000000+00:00", "1970-01-01 00:00:00");
-      checkSparkTimestamp("2043-08-11T12:30:01.000001+00:00", "2043-08-11 12:30:01.000001");
-    } finally {
-      TimeZone.setDefault(currentTz);
-    }
-  }
-
-  public void checkSparkTimestamp(String timestampString, String sparkRepr) {
-    Literal<Long> ts = Literal.of(timestampString).to(Types.TimestampType.withZone());
-    String sparkTimestamp = DateTimeUtils.timestampToString(ts.value());
-    Assert.assertEquals(
-        "Should be the same timestamp (" + ts.value() + ")", sparkRepr, sparkTimestamp);
-  }
-}
diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java
deleted file mode 100644
index 3c9037adc393..000000000000
--- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Files; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.exceptions.RuntimeIOException; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.orc.OrcConf; -import org.apache.orc.OrcFile; -import org.apache.orc.Reader; -import org.apache.orc.StripeInformation; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.vectorized.ColumnarBatch; -import org.apache.spark.unsafe.types.UTF8String; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestSparkOrcReadMetadataColumns { - private static final Schema DATA_SCHEMA = - new Schema( - required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); - - private static final Schema PROJECTION_SCHEMA = - new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED); - - private static final int NUM_ROWS = 1000; - private static final List DATA_ROWS; - private static final List EXPECTED_ROWS; - - static { - DATA_ROWS = Lists.newArrayListWithCapacity(NUM_ROWS); - for (long i = 0; i < NUM_ROWS; i++) { - InternalRow row = new GenericInternalRow(DATA_SCHEMA.columns().size()); - row.update(0, i); - row.update(1, UTF8String.fromString("str" + i)); - DATA_ROWS.add(row); - } - - EXPECTED_ROWS = Lists.newArrayListWithCapacity(NUM_ROWS); - for (long i = 0; i < NUM_ROWS; i++) { - InternalRow row = new GenericInternalRow(PROJECTION_SCHEMA.columns().size()); - row.update(0, i); - row.update(1, UTF8String.fromString("str" + i)); - row.update(2, i); - row.update(3, false); - EXPECTED_ROWS.add(row); - } - } - - @Parameterized.Parameters(name = "vectorized = {0}") - public static Object[] parameters() { - return new Object[] {false, true}; - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private boolean vectorized; - private File testFile; - - public TestSparkOrcReadMetadataColumns(boolean vectorized) { - this.vectorized = vectorized; - } - - @Before - public void writeFile() throws IOException { - testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(DATA_SCHEMA) - // write 
in such a way that the file contains 10 stripes each with 100 rows - .set("iceberg.orc.vectorbatch.size", "100") - .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") - .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") - .build()) { - writer.addAll(DATA_ROWS); - } - } - - @Test - public void testReadRowNumbers() throws IOException { - readAndValidate(null, null, null, EXPECTED_ROWS); - } - - @Test - public void testReadRowNumbersWithFilter() throws IOException { - readAndValidate( - Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); - } - - @Test - public void testReadRowNumbersWithSplits() throws IOException { - Reader reader; - try { - OrcFile.ReaderOptions readerOptions = - OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); - reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); - } catch (IOException ioe) { - throw new RuntimeIOException(ioe, "Failed to open file: %s", testFile); - } - List splitOffsets = - reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()); - List splitLengths = - reader.getStripes().stream().map(StripeInformation::getLength).collect(Collectors.toList()); - - for (int i = 0; i < 10; i++) { - readAndValidate( - null, - splitOffsets.get(i), - splitLengths.get(i), - EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); - } - } - - private void readAndValidate( - Expression filter, Long splitStart, Long splitLength, List expected) - throws IOException { - Schema projectionWithoutMetadataFields = - TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); - CloseableIterable reader = null; - try { - ORC.ReadBuilder builder = - ORC.read(Files.localInput(testFile)).project(projectionWithoutMetadataFields); - - if (vectorized) { - builder = - builder.createBatchedReaderFunc( - readOrcSchema -> - VectorizedSparkOrcReaders.buildReader( - PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); - } else { - builder = - builder.createReaderFunc( - readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); - } - - if (filter != null) { - builder = builder.filter(filter); - } - - if (splitStart != null && splitLength != null) { - builder = builder.split(splitStart, splitLength); - } - - if (vectorized) { - reader = batchesToRows(builder.build()); - } else { - reader = builder.build(); - } - - final Iterator actualRows = reader.iterator(); - final Iterator expectedRows = expected.iterator(); - while (expectedRows.hasNext()) { - Assert.assertTrue("Should have expected number of rows", actualRows.hasNext()); - TestHelpers.assertEquals(PROJECTION_SCHEMA, expectedRows.next(), actualRows.next()); - } - Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); - } finally { - if (reader != null) { - reader.close(); - } - } - } - - private CloseableIterable batchesToRows(CloseableIterable batches) { - return CloseableIterable.combine( - Iterables.concat(Iterables.transform(batches, b -> (Iterable) b::rowIterator)), - batches); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java deleted file mode 100644 index b23fe729a187..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterators; -import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.vectorized.ColumnarBatch; -import org.junit.Assert; -import org.junit.Test; - -public class TestSparkOrcReader extends AvroDataTest { - @Override - protected void writeAndValidate(Schema schema) throws IOException { - final Iterable expected = RandomData.generateSpark(schema, 100, 0L); - - writeAndValidateRecords(schema, expected); - } - - @Test - public void writeAndValidateRepeatingRecords() throws IOException { - Schema structSchema = - new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get())); - List expectedRepeating = - Collections.nCopies(100, RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); - - writeAndValidateRecords(structSchema, expectedRepeating); - } - - private void writeAndValidateRecords(Schema schema, Iterable expected) - throws IOException { - final File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { - writer.addAll(expected); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { - final Iterator actualRows = reader.iterator(); - final Iterator expectedRows = expected.iterator(); - while (expectedRows.hasNext()) { - Assert.assertTrue("Should have expected number of rows", actualRows.hasNext()); - assertEquals(schema, expectedRows.next(), actualRows.next()); - } - Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); - } - - try (CloseableIterable reader = - ORC.read(Files.localInput(testFile)) - .project(schema) - .createBatchedReaderFunc( - readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) - .build()) { - final Iterator actualRows = batchesToRows(reader.iterator()); - final 
Iterator expectedRows = expected.iterator(); - while (expectedRows.hasNext()) { - Assert.assertTrue("Should have expected number of rows", actualRows.hasNext()); - assertEquals(schema, expectedRows.next(), actualRows.next()); - } - Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); - } - } - - private Iterator batchesToRows(Iterator batches) { - return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator)); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java deleted file mode 100644 index 929d08f2cdb6..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import org.apache.arrow.vector.NullCheckingForGet; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Files; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetSchemaUtil; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; -import org.apache.iceberg.types.Types; -import org.apache.parquet.ParquetReadOptions; -import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.util.HadoopInputFile; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.sql.vectorized.ColumnarBatch; -import org.apache.spark.unsafe.types.UTF8String; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) 
-public class TestSparkParquetReadMetadataColumns { - private static final Schema DATA_SCHEMA = - new Schema( - required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); - - private static final Schema PROJECTION_SCHEMA = - new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION); - - private static final int NUM_ROWS = 1000; - private static final List DATA_ROWS; - private static final List EXPECTED_ROWS; - private static final int NUM_ROW_GROUPS = 10; - private static final int ROWS_PER_SPLIT = NUM_ROWS / NUM_ROW_GROUPS; - private static final int RECORDS_PER_BATCH = ROWS_PER_SPLIT / 10; - - static { - DATA_ROWS = Lists.newArrayListWithCapacity(NUM_ROWS); - for (long i = 0; i < NUM_ROWS; i += 1) { - InternalRow row = new GenericInternalRow(DATA_SCHEMA.columns().size()); - if (i >= NUM_ROWS / 2) { - row.update(0, 2 * i); - } else { - row.update(0, i); - } - row.update(1, UTF8String.fromString("str" + i)); - DATA_ROWS.add(row); - } - - EXPECTED_ROWS = Lists.newArrayListWithCapacity(NUM_ROWS); - for (long i = 0; i < NUM_ROWS; i += 1) { - InternalRow row = new GenericInternalRow(PROJECTION_SCHEMA.columns().size()); - if (i >= NUM_ROWS / 2) { - row.update(0, 2 * i); - } else { - row.update(0, i); - } - row.update(1, UTF8String.fromString("str" + i)); - row.update(2, i); - EXPECTED_ROWS.add(row); - } - } - - @Parameterized.Parameters(name = "vectorized = {0}") - public static Object[][] parameters() { - return new Object[][] {new Object[] {false}, new Object[] {true}}; - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private final boolean vectorized; - private File testFile; - - public TestSparkParquetReadMetadataColumns(boolean vectorized) { - this.vectorized = vectorized; - } - - @Before - public void writeFile() throws IOException { - List fileSplits = Lists.newArrayList(); - StructType struct = SparkSchemaUtil.convert(DATA_SCHEMA); - Configuration conf = new Configuration(); - - testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - ParquetFileWriter parquetFileWriter = - new ParquetFileWriter( - conf, - ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), - new Path(testFile.getAbsolutePath())); - - parquetFileWriter.start(); - for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { - File split = temp.newFile(); - Assert.assertTrue("Delete should succeed", split.delete()); - fileSplits.add(new Path(split.getAbsolutePath())); - try (FileAppender writer = - Parquet.write(Files.localOutput(split)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) - .schema(DATA_SCHEMA) - .overwrite() - .build()) { - writer.addAll(DATA_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); - } - parquetFileWriter.appendFile( - HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); - } - parquetFileWriter.end( - ParquetFileWriter.mergeMetadataFiles(fileSplits, conf) - .getFileMetaData() - .getKeyValueMetaData()); - } - - @Test - public void testReadRowNumbers() throws IOException { - readAndValidate(null, null, null, EXPECTED_ROWS); - } - - @Test - public void testReadRowNumbersWithFilter() throws IOException { - // current iceberg supports row group filter. 
- for (int i = 1; i < 5; i += 1) { - readAndValidate( - Expressions.and( - Expressions.lessThan("id", NUM_ROWS / 2), - Expressions.greaterThanOrEqual("id", i * ROWS_PER_SPLIT)), - null, - null, - EXPECTED_ROWS.subList(i * ROWS_PER_SPLIT, NUM_ROWS / 2)); - } - } - - @Test - public void testReadRowNumbersWithSplits() throws IOException { - ParquetFileReader fileReader = - new ParquetFileReader( - HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), - ParquetReadOptions.builder().build()); - List rowGroups = fileReader.getRowGroups(); - for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { - readAndValidate( - null, - rowGroups.get(i).getColumns().get(0).getStartingPos(), - rowGroups.get(i).getCompressedSize(), - EXPECTED_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); - } - } - - private void readAndValidate( - Expression filter, Long splitStart, Long splitLength, List expected) - throws IOException { - Parquet.ReadBuilder builder = - Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); - - if (vectorized) { - builder.createBatchedReaderFunc( - fileSchema -> - VectorizedSparkParquetReaders.buildReader( - PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); - builder.recordsPerBatch(RECORDS_PER_BATCH); - } else { - builder = - builder.createReaderFunc( - msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); - } - - if (filter != null) { - builder = builder.filter(filter); - } - - if (splitStart != null && splitLength != null) { - builder = builder.split(splitStart, splitLength); - } - - try (CloseableIterable reader = - vectorized ? batchesToRows(builder.build()) : builder.build()) { - final Iterator actualRows = reader.iterator(); - - for (InternalRow internalRow : expected) { - Assert.assertTrue("Should have expected number of rows", actualRows.hasNext()); - TestHelpers.assertEquals(PROJECTION_SCHEMA, internalRow, actualRows.next()); - } - - Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); - } - } - - private CloseableIterable batchesToRows(CloseableIterable batches) { - return CloseableIterable.combine( - Iterables.concat(Iterables.transform(batches, b -> (Iterable) b::rowIterator)), - batches); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java deleted file mode 100644 index 85ee15504825..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import org.apache.avro.generic.GenericData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Files; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.IcebergGenerics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetUtil; -import org.apache.iceberg.parquet.ParquetWriteAdapter; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.api.WriteSupport; -import org.apache.parquet.hadoop.util.HadoopOutputFile; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Test; - -public class TestSparkParquetReader extends AvroDataTest { - @Override - protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue( - "Parquet Avro cannot write non-string map keys", - null - == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); - - List expected = RandomData.generateList(schema, 100, 0L); - - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { - writer.addAll(expected); - } - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) - .build()) { - Iterator rows = reader.iterator(); - for (GenericData.Record record : expected) { - Assert.assertTrue("Should have expected number of rows", rows.hasNext()); - assertEqualsUnsafe(schema.asStruct(), record, rows.next()); - } - Assert.assertFalse("Should not have extra rows", rows.hasNext()); - } - } - - protected List rowsFromFile(InputFile inputFile, Schema schema) throws IOException { - try (CloseableIterable reader = - Parquet.read(inputFile) - .project(schema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) - .build()) { - return Lists.newArrayList(reader); - } - } - - protected Table tableFromInputFile(InputFile inputFile, Schema schema) throws IOException { - HadoopTables tables = new HadoopTables(); - Table table = - tables.create( - schema, - 
PartitionSpec.unpartitioned(), - ImmutableMap.of(), - temp.newFolder().getCanonicalPath()); - - table - .newAppend() - .appendFile( - DataFiles.builder(PartitionSpec.unpartitioned()) - .withFormat(FileFormat.PARQUET) - .withInputFile(inputFile) - .withMetrics(ParquetUtil.fileMetrics(inputFile, MetricsConfig.getDefault())) - .withFileSizeInBytes(inputFile.getLength()) - .build()) - .commit(); - - return table; - } - - @Test - public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException { - String outputFilePath = - String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); - HadoopOutputFile outputFile = - HadoopOutputFile.fromPath( - new org.apache.hadoop.fs.Path(outputFilePath), new Configuration()); - Schema schema = new Schema(required(1, "ts", Types.TimestampType.withZone())); - StructType sparkSchema = - new StructType( - new StructField[] { - new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) - }); - List rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L)); - - try (FileAppender writer = - new ParquetWriteAdapter<>( - new NativeSparkWriterBuilder(outputFile) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.outputTimestampType", "INT96") - .build(), - MetricsConfig.getDefault())) { - writer.addAll(rows); - } - - InputFile parquetInputFile = Files.localInput(outputFilePath); - List readRows = rowsFromFile(parquetInputFile, schema); - Assert.assertEquals(rows.size(), readRows.size()); - Assertions.assertThat(readRows).isEqualTo(rows); - - // Now we try to import that file as an Iceberg table to make sure Iceberg can read - // Int96 end to end. - Table int96Table = tableFromInputFile(parquetInputFile, schema); - List tableRecords = Lists.newArrayList(IcebergGenerics.read(int96Table).build()); - - Assert.assertEquals(rows.size(), tableRecords.size()); - - for (int i = 0; i < tableRecords.size(); i++) { - GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i)); - } - } - - /** - * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's - * native ParquetWriteSupport. - */ - private static class NativeSparkWriterBuilder - extends ParquetWriter.Builder { - private final Map config = Maps.newHashMap(); - - NativeSparkWriterBuilder(org.apache.parquet.io.OutputFile path) { - super(path); - } - - public NativeSparkWriterBuilder set(String property, String value) { - this.config.put(property, value); - return self(); - } - - @Override - protected NativeSparkWriterBuilder self() { - return this; - } - - @Override - protected WriteSupport getWriteSupport(Configuration configuration) { - for (Map.Entry entry : config.entrySet()) { - configuration.set(entry.getKey(), entry.getValue()); - } - - return new org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport(); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java deleted file mode 100644 index 261fb8838aa4..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSparkParquetWriter { - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = - new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required( - 5, - "strict", - Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional( - 6, - "hopeful", - Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()))), - optional(10, "vehement", Types.LongType.get()))), - optional( - 11, - "metamorphosis", - Types.MapType.ofRequired( - 12, 13, Types.StringType.get(), Types.TimestampType.withZone())), - required( - 14, - "winter", - Types.ListType.ofOptional( - 15, - Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.FloatType.get()), - optional(18, "wheeze", Types.StringType.get())))), - optional( - 19, - "renovate", - Types.MapType.ofRequired( - 20, - 21, - Types.StringType.get(), - Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.IntegerType.get()), - required(24, "couch rope", Types.IntegerType.get())))), - optional(2, "slide", Types.StringType.get())); - - @Test - public void testCorrectness() throws IOException { - int numRows = 50_000; - Iterable records = RandomData.generateSpark(COMPLEX_SCHEMA, numRows, 19981); - - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc( - msgType -> - SparkParquetWriters.buildWriter( - SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) - .build()) { - writer.addAll(records); - } - - try (CloseableIterable reader = - Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) - .build()) { - Iterator expected = records.iterator(); - Iterator rows = reader.iterator(); - for (int i = 
0; i < numRows; i += 1) { - Assert.assertTrue("Should have expected number of rows", rows.hasNext()); - TestHelpers.assertEquals(COMPLEX_SCHEMA, expected.next(), rows.next()); - } - Assert.assertFalse("Should not have extra rows", rows.hasNext()); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java deleted file mode 100644 index d10e7f5a19e3..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data; - -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.math.BigDecimal; -import java.util.Iterator; -import java.util.List; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.orc.GenericOrcWriter; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.Assert; -import org.junit.Test; - -public class TestSparkRecordOrcReaderWriter extends AvroDataTest { - private static final int NUM_RECORDS = 200; - - private void writeAndValidate(Schema schema, List expectedRecords) throws IOException { - final File originalFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", originalFile.delete()); - - // Write few generic records into the original test file. - try (FileAppender writer = - ORC.write(Files.localOutput(originalFile)) - .createWriterFunc(GenericOrcWriter::buildWriter) - .schema(schema) - .build()) { - writer.addAll(expectedRecords); - } - - // Read into spark InternalRow from the original test file. 
- List internalRows = Lists.newArrayList(); - try (CloseableIterable reader = - ORC.read(Files.localInput(originalFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { - reader.forEach(internalRows::add); - assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); - } - - final File anotherFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", anotherFile.delete()); - - // Write those spark InternalRows into a new file again. - try (FileAppender writer = - ORC.write(Files.localOutput(anotherFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { - writer.addAll(internalRows); - } - - // Check whether the InternalRows are expected records. - try (CloseableIterable reader = - ORC.read(Files.localInput(anotherFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { - assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); - } - - // Read into iceberg GenericRecord and check again. - try (CloseableIterable reader = - ORC.read(Files.localInput(anotherFile)) - .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) - .project(schema) - .build()) { - assertRecordEquals(expectedRecords, reader, expectedRecords.size()); - } - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1992L); - writeAndValidate(schema, expectedRecords); - } - - @Test - public void testDecimalWithTrailingZero() throws IOException { - Schema schema = - new Schema( - required(1, "d1", Types.DecimalType.of(10, 2)), - required(2, "d2", Types.DecimalType.of(20, 5)), - required(3, "d3", Types.DecimalType.of(38, 20))); - - List expected = Lists.newArrayList(); - - GenericRecord record = GenericRecord.create(schema); - record.set(0, new BigDecimal("101.00")); - record.set(1, new BigDecimal("10.00E-3")); - record.set(2, new BigDecimal("1001.0000E-16")); - - expected.add(record.copy()); - - writeAndValidate(schema, expected); - } - - private static void assertRecordEquals( - Iterable expected, Iterable actual, int size) { - Iterator expectedIter = expected.iterator(); - Iterator actualIter = actual.iterator(); - for (int i = 0; i < size; i += 1) { - Assert.assertTrue("Expected iterator should have more rows", expectedIter.hasNext()); - Assert.assertTrue("Actual iterator should have more rows", actualIter.hasNext()); - Assert.assertEquals("Should have same rows.", expectedIter.next(), actualIter.next()); - } - Assert.assertFalse("Expected iterator should not have any extra rows.", expectedIter.hasNext()); - Assert.assertFalse("Actual iterator should not have any extra rows.", actualIter.hasNext()); - } - - private static void assertEqualsUnsafe( - Types.StructType struct, Iterable expected, Iterable actual, int size) { - Iterator expectedIter = expected.iterator(); - Iterator actualIter = actual.iterator(); - for (int i = 0; i < size; i += 1) { - Assert.assertTrue("Expected iterator should have more rows", expectedIter.hasNext()); - Assert.assertTrue("Actual iterator should have more rows", actualIter.hasNext()); - GenericsHelpers.assertEqualsUnsafe(struct, expectedIter.next(), actualIter.next()); - } - Assert.assertFalse("Expected iterator should not have any extra rows.", expectedIter.hasNext()); - Assert.assertFalse("Actual iterator should not have any extra rows.", 
actualIter.hasNext()); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java deleted file mode 100644 index 756f49a2aad6..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.data.parquet.vectorized; - -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - -import java.io.File; -import java.io.IOException; -import org.apache.avro.generic.GenericData; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Function; -import org.apache.iceberg.relocated.com.google.common.collect.FluentIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.spark.data.RandomData; -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Test; - -public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads { - - @Override - Iterable generateData( - Schema schema, - int numRecords, - long seed, - float nullPercentage, - Function transform) { - Iterable data = - RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); - return transform == IDENTITY ? 
data : Iterables.transform(data, transform); - } - - @Test - @Override - @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException {} - - @Test - public void testMixedDictionaryNonDictionaryReads() throws IOException { - Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); - File dictionaryEncodedFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", dictionaryEncodedFile.delete()); - Iterable dictionaryEncodableData = - RandomData.generateDictionaryEncodableData( - schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); - try (FileAppender writer = - getParquetWriter(schema, dictionaryEncodedFile)) { - writer.addAll(dictionaryEncodableData); - } - - File plainEncodingFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", plainEncodingFile.delete()); - Iterable nonDictionaryData = - RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); - try (FileAppender writer = getParquetWriter(schema, plainEncodingFile)) { - writer.addAll(nonDictionaryData); - } - - int rowGroupSize = PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - File mixedFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", mixedFile.delete()); - Parquet.concat( - ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), - mixedFile, - rowGroupSize, - schema, - ImmutableMap.of()); - assertRecordsMatch( - schema, - 30000, - FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), - mixedFile, - false, - true, - BATCH_SIZE); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java deleted file mode 100644 index 42ea34936b5f..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data.parquet.vectorized; - -import java.io.File; -import java.io.IOException; -import org.apache.avro.generic.GenericData; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Function; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.spark.data.RandomData; -import org.junit.Ignore; -import org.junit.Test; - -public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads - extends TestParquetVectorizedReads { - private static final int NUM_ROWS = 1_000_000; - - @Override - protected int getNumRows() { - return NUM_ROWS; - } - - @Override - Iterable generateData( - Schema schema, - int numRecords, - long seed, - float nullPercentage, - Function transform) { - // TODO: take into account nullPercentage when generating fallback encoding data - Iterable data = RandomData.generateFallbackData(schema, numRecords, seed, numRecords / 20); - return transform == IDENTITY ? data : Iterables.transform(data, transform); - } - - @Override - FileAppender getParquetWriter(Schema schema, File testFile) - throws IOException { - return Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .set(TableProperties.PARQUET_DICT_SIZE_BYTES, "512000") - .build(); - } - - @Test - @Override - @Ignore // Fallback encoding not triggered when data is mostly null - public void testMostlyNullsForOptionalFields() {} - - @Test - @Override - @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException {} -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java deleted file mode 100644 index 8908a23fad8f..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.data.parquet.vectorized; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import org.apache.avro.generic.GenericData; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.base.Function; -import org.apache.iceberg.relocated.com.google.common.base.Strings; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.data.AvroDataTest; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.TestHelpers; -import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.Type; -import org.apache.spark.sql.vectorized.ColumnarBatch; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Ignore; -import org.junit.Test; - -public class TestParquetVectorizedReads extends AvroDataTest { - private static final int NUM_ROWS = 200_000; - static final int BATCH_SIZE = 10_000; - - static final Function IDENTITY = record -> record; - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate(schema, getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, false, true); - } - - private void writeAndValidate( - Schema schema, - int numRecords, - long seed, - float nullPercentage, - boolean setAndCheckArrowValidityVector, - boolean reuseContainers) - throws IOException { - writeAndValidate( - schema, - numRecords, - seed, - nullPercentage, - setAndCheckArrowValidityVector, - reuseContainers, - BATCH_SIZE, - IDENTITY); - } - - private void writeAndValidate( - Schema schema, - int numRecords, - long seed, - float nullPercentage, - boolean setAndCheckArrowValidityVector, - boolean reuseContainers, - int batchSize, - Function transform) - throws IOException { - // Write test data - Assume.assumeTrue( - "Parquet Avro cannot write non-string map keys", - null - == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); - - Iterable expected = - generateData(schema, numRecords, seed, nullPercentage, transform); - - // write a test parquet file using iceberg writer - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = getParquetWriter(schema, testFile)) { - writer.addAll(expected); - } - assertRecordsMatch( - schema, - numRecords, - expected, - testFile, - setAndCheckArrowValidityVector, - reuseContainers, - batchSize); - } - - protected int getNumRows() { - return NUM_ROWS; - } - - Iterable generateData( - Schema schema, - int numRecords, - long seed, - float nullPercentage, - Function transform) { - Iterable data = - RandomData.generate(schema, numRecords, seed, nullPercentage); - return transform == IDENTITY ? 
data : Iterables.transform(data, transform); - } - - FileAppender getParquetWriter(Schema schema, File testFile) - throws IOException { - return Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build(); - } - - FileAppender getParquetV2Writer(Schema schema, File testFile) - throws IOException { - return Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) - .build(); - } - - void assertRecordsMatch( - Schema schema, - int expectedSize, - Iterable expected, - File testFile, - boolean setAndCheckArrowValidityBuffer, - boolean reuseContainers, - int batchSize) - throws IOException { - Parquet.ReadBuilder readBuilder = - Parquet.read(Files.localInput(testFile)) - .project(schema) - .recordsPerBatch(batchSize) - .createBatchedReaderFunc( - type -> - VectorizedSparkParquetReaders.buildReader( - schema, type, setAndCheckArrowValidityBuffer)); - if (reuseContainers) { - readBuilder.reuseContainers(); - } - try (CloseableIterable batchReader = readBuilder.build()) { - Iterator expectedIter = expected.iterator(); - Iterator batches = batchReader.iterator(); - int numRowsRead = 0; - while (batches.hasNext()) { - ColumnarBatch batch = batches.next(); - numRowsRead += batch.numRows(); - TestHelpers.assertEqualsBatch( - schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); - } - Assert.assertEquals(expectedSize, numRowsRead); - } - } - - @Test - @Ignore - public void testArray() {} - - @Test - @Ignore - public void testArrayOfStructs() {} - - @Test - @Ignore - public void testMap() {} - - @Test - @Ignore - public void testNumericMapKey() {} - - @Test - @Ignore - public void testComplexMapKey() {} - - @Test - @Ignore - public void testMapOfStructs() {} - - @Test - @Ignore - public void testMixedTypes() {} - - @Test - @Override - public void testNestedStruct() { - AssertHelpers.assertThrows( - "Vectorized reads are not supported yet for struct fields", - UnsupportedOperationException.class, - "Vectorized reads are not supported yet for struct fields", - () -> - VectorizedSparkParquetReaders.buildReader( - TypeUtil.assignIncreasingFreshIds( - new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))), - new MessageType( - "struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), - false)); - } - - @Test - public void testMostlyNullsForOptionalFields() throws IOException { - writeAndValidate( - TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), - getNumRows(), - 0L, - 0.99f, - false, - true); - } - - @Test - public void testSettingArrowValidityVector() throws IOException { - writeAndValidate( - new Schema(Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), - getNumRows(), - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, - true, - true); - } - - @Test - public void testVectorizedReadsWithNewContainers() throws IOException { - writeAndValidate( - TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), - getNumRows(), - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, - true, - false); - } - - @Test - public void testVectorizedReadsWithReallocatedArrowBuffers() throws IOException { - // With a batch size of 2, 256 bytes are allocated in the VarCharVector. By adding strings of - // length 512, the vector will need to be reallocated for storing the batch. 
- writeAndValidate( - new Schema( - Lists.newArrayList( - SUPPORTED_PRIMITIVES.field("id"), SUPPORTED_PRIMITIVES.field("data"))), - 10, - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, - true, - true, - 2, - record -> { - if (record.get("data") != null) { - record.put("data", Strings.padEnd((String) record.get("data"), 512, 'a')); - } else { - record.put("data", Strings.padEnd("", 512, 'a')); - } - return record; - }); - } - - @Test - public void testReadsForTypePromotedColumns() throws Exception { - Schema writeSchema = - new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.IntegerType.get()), - optional(102, "float_data", Types.FloatType.get()), - optional(103, "decimal_data", Types.DecimalType.of(10, 5))); - - File dataFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = - generateData(writeSchema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); - try (FileAppender writer = getParquetWriter(writeSchema, dataFile)) { - writer.addAll(data); - } - - Schema readSchema = - new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.LongType.get()), - optional(102, "float_data", Types.DoubleType.get()), - optional(103, "decimal_data", Types.DecimalType.of(25, 5))); - - assertRecordsMatch(readSchema, 30000, data, dataFile, false, true, BATCH_SIZE); - } - - @Test - public void testSupportedReadsForParquetV2() throws Exception { - // Only float and double column types are written using plain encoding with Parquet V2 - Schema schema = - new Schema( - optional(102, "float_data", Types.FloatType.get()), - optional(103, "double_data", Types.DoubleType.get())); - - File dataFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = - generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); - try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { - writer.addAll(data); - } - assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); - } - - @Test - public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc use delta encoding and which are not supported for vectorized - // reads - Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); - File dataFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = - generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); - try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { - writer.addAll(data); - } - AssertHelpers.assertThrows( - "Vectorized reads not supported", - UnsupportedOperationException.class, - "Cannot support vectorized reads for column", - () -> { - assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); - return null; - }); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ComplexRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ComplexRecord.java deleted file mode 100644 index 42e8552578cd..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ComplexRecord.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -public class ComplexRecord { - private long id; - private NestedRecord struct; - - public ComplexRecord() {} - - public ComplexRecord(long id, NestedRecord struct) { - this.id = id; - this.struct = struct; - } - - public long getId() { - return id; - } - - public void setId(long id) { - this.id = id; - } - - public NestedRecord getStruct() { - return struct; - } - - public void setStruct(NestedRecord struct) { - this.struct = struct; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - ComplexRecord record = (ComplexRecord) o; - return id == record.id && Objects.equal(struct, record.struct); - } - - @Override - public int hashCode() { - return Objects.hashCode(id, struct); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this).add("id", id).add("struct", struct).toString(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java deleted file mode 100644 index 53a35eec61ce..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import java.time.Instant; -import java.util.concurrent.atomic.AtomicInteger; - -public class LogMessage { - private static AtomicInteger idCounter = new AtomicInteger(0); - - static LogMessage debug(String date, String message) { - return new LogMessage(idCounter.getAndIncrement(), date, "DEBUG", message); - } - - static LogMessage debug(String date, String message, Instant timestamp) { - return new LogMessage(idCounter.getAndIncrement(), date, "DEBUG", message, timestamp); - } - - static LogMessage info(String date, String message) { - return new LogMessage(idCounter.getAndIncrement(), date, "INFO", message); - } - - static LogMessage info(String date, String message, Instant timestamp) { - return new LogMessage(idCounter.getAndIncrement(), date, "INFO", message, timestamp); - } - - static LogMessage error(String date, String message) { - return new LogMessage(idCounter.getAndIncrement(), date, "ERROR", message); - } - - static LogMessage error(String date, String message, Instant timestamp) { - return new LogMessage(idCounter.getAndIncrement(), date, "ERROR", message, timestamp); - } - - static LogMessage warn(String date, String message) { - return new LogMessage(idCounter.getAndIncrement(), date, "WARN", message); - } - - static LogMessage warn(String date, String message, Instant timestamp) { - return new LogMessage(idCounter.getAndIncrement(), date, "WARN", message, timestamp); - } - - private int id; - private String date; - private String level; - private String message; - private Instant timestamp; - - private LogMessage(int id, String date, String level, String message) { - this.id = id; - this.date = date; - this.level = level; - this.message = message; - } - - private LogMessage(int id, String date, String level, String message, Instant timestamp) { - this.id = id; - this.date = date; - this.level = level; - this.message = message; - this.timestamp = timestamp; - } - - public int getId() { - return id; - } - - public void setId(int id) { - this.id = id; - } - - public String getDate() { - return date; - } - - public void setDate(String date) { - this.date = date; - } - - public String getLevel() { - return level; - } - - public void setLevel(String level) { - this.level = level; - } - - public String getMessage() { - return message; - } - - public void setMessage(String message) { - this.message = message; - } - - public Instant getTimestamp() { - return timestamp; - } - - public void setTimestamp(Instant timestamp) { - this.timestamp = timestamp; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java deleted file mode 100644 index 3a8d087258a7..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.Map; -import java.util.Optional; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.spark.SparkWriteConf; -import org.apache.iceberg.types.TypeUtil; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.sources.DataSourceRegister; -import org.apache.spark.sql.sources.v2.DataSourceOptions; -import org.apache.spark.sql.sources.v2.DataSourceV2; -import org.apache.spark.sql.sources.v2.WriteSupport; -import org.apache.spark.sql.sources.v2.writer.DataSourceWriter; -import org.apache.spark.sql.types.StructType; - -public class ManualSource implements WriteSupport, DataSourceRegister, DataSourceV2 { - public static final String SHORT_NAME = "manual_source"; - public static final String TABLE_NAME = "table_name"; - private static final Map tableMap = Maps.newHashMap(); - - private SparkSession lazySpark = null; - private Configuration lazyConf = null; - - public static void setTable(String name, Table table) { - Preconditions.checkArgument( - !tableMap.containsKey(name), "Cannot set " + name + ". 
It is already set"); - tableMap.put(name, table); - } - - public static void clearTables() { - tableMap.clear(); - } - - @Override - public String shortName() { - return SHORT_NAME; - } - - @Override - public Optional createWriter( - String writeUUID, StructType dsStruct, SaveMode mode, DataSourceOptions options) { - - Map properties = options.asMap(); - Preconditions.checkArgument( - properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); - String tableName = properties.get(TABLE_NAME); - Preconditions.checkArgument(tableMap.containsKey(tableName), "Table missing " + tableName); - Table table = tableMap.get(tableName); - - SparkWriteConf writeConf = new SparkWriteConf(lazySparkSession(), table, options.asMap()); - Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct); - TypeUtil.validateWriteSchema( - table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); - SparkUtil.validatePartitionTransforms(table.spec()); - String appId = lazySparkSession().sparkContext().applicationId(); - String wapId = writeConf.wapId(); - boolean replacePartitions = mode == SaveMode.Overwrite; - - return Optional.of( - new Writer( - lazySparkSession(), - table, - writeConf, - replacePartitions, - appId, - wapId, - writeSchema, - dsStruct)); - } - - private SparkSession lazySparkSession() { - if (lazySpark == null) { - this.lazySpark = SparkSession.builder().getOrCreate(); - } - return lazySpark; - } - - private Configuration lazyBaseConf() { - if (lazyConf == null) { - this.lazyConf = lazySparkSession().sessionState().newHadoopConf(); - } - return lazyConf; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/NestedRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/NestedRecord.java deleted file mode 100644 index ca36bfd4938b..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/NestedRecord.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -public class NestedRecord { - private long innerId; - private String innerName; - - public NestedRecord() {} - - public NestedRecord(long innerId, String innerName) { - this.innerId = innerId; - this.innerName = innerName; - } - - public long getInnerId() { - return innerId; - } - - public String getInnerName() { - return innerName; - } - - public void setInnerId(long iId) { - innerId = iId; - } - - public void setInnerName(String name) { - innerName = name; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - - if (o == null || getClass() != o.getClass()) { - return false; - } - - NestedRecord that = (NestedRecord) o; - return innerId == that.innerId && Objects.equal(innerName, that.innerName); - } - - @Override - public int hashCode() { - return Objects.hashCode(innerId, innerName); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("innerId", innerId) - .add("innerName", innerName) - .toString(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java deleted file mode 100644 index 550e20b9338e..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import org.apache.iceberg.relocated.com.google.common.base.Objects; - -public class SimpleRecord { - private Integer id; - private String data; - - public SimpleRecord() {} - - public SimpleRecord(Integer id, String data) { - this.id = id; - this.data = data; - } - - public Integer getId() { - return id; - } - - public void setId(Integer id) { - this.id = id; - } - - public String getData() { - return data; - } - - public void setData(String data) { - this.data = data; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - SimpleRecord record = (SimpleRecord) o; - return Objects.equal(id, record.id) && Objects.equal(data, record.data); - } - - @Override - public int hashCode() { - return Objects.hashCode(id, data); - } - - @Override - public String toString() { - StringBuilder buffer = new StringBuilder(); - buffer.append("{\"id\"="); - buffer.append(id); - buffer.append(",\"data\"=\""); - buffer.append(data); - buffer.append("\"}"); - return buffer.toString(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java deleted file mode 100644 index 9491adde4605..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.Files.localOutput; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.UUID; -import org.apache.avro.generic.GenericData.Record; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.spark.data.AvroDataTest; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.TestHelpers; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; - -public class TestAvroScan extends AvroDataTest { - private static final Configuration CONF = new Configuration(); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestAvroScan.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestAvroScan.spark; - TestAvroScan.spark = null; - currentSpark.stop(); - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - File parent = temp.newFolder("avro"); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - dataFolder.mkdirs(); - - File avroFile = - new File(dataFolder, FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); - - HadoopTables tables = new HadoopTables(CONF); - Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); - - // Important: use the table's schema for the rest of the test - // When tables are created, the column ids are reassigned. - Schema tableSchema = table.schema(); - - List expected = RandomData.generateList(tableSchema, 100, 1L); - - try (FileAppender writer = - Avro.write(localOutput(avroFile)).schema(tableSchema).build()) { - writer.addAll(expected); - } - - DataFile file = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.toString()) - .build(); - - table.newAppend().appendFile(file).commit(); - - Dataset df = spark.read().format("iceberg").load(location.toString()); - - List rows = df.collectAsList(); - Assert.assertEquals("Should contain 100 rows", 100, rows.size()); - - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i)); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java deleted file mode 100644 index 3e0c9cbb4052..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.Transaction; -import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -public class TestCatalog implements Catalog, Configurable { - - private HadoopTables tables; - private Configuration conf; - private String warehouse; - - public TestCatalog() {} - - @Override - public String name() { - return "test-tables"; - } - - private String tablePath(TableIdentifier identifier) { - return String.format("%s/%s", warehouse, identifier.name()); - } - - @Override - public List listTables(Namespace namespace) { - throw new UnsupportedOperationException(); - } - - @Override - public Table createTable( - TableIdentifier identifier, - Schema schema, - PartitionSpec spec, - String location, - Map properties) { - return tables.create(schema, spec, properties, tablePath(identifier)); - } - - @Override - public Transaction newCreateTableTransaction( - TableIdentifier identifier, - Schema schema, - PartitionSpec spec, - String location, - Map properties) { - throw new UnsupportedOperationException(); - } - - @Override - public Transaction newReplaceTableTransaction( - TableIdentifier identifier, - Schema schema, - PartitionSpec spec, - String location, - Map properties, - boolean orCreate) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean dropTable(TableIdentifier identifier, boolean purge) { - return tables.dropTable(tablePath(identifier), purge); - } - - @Override - public void renameTable(TableIdentifier from, TableIdentifier to) { - throw new UnsupportedOperationException(); - } - - @Override - public Table loadTable(TableIdentifier identifier) { - return tables.load(tablePath(identifier)); - } - - @Override - public void initialize(String name, Map properties) { - String uri = properties.get(CatalogProperties.URI); - warehouse = properties.get("warehouse"); - Preconditions.checkArgument( - uri != null, "Cannot initialize TestCatalog. The metastore connection uri must be set."); - Preconditions.checkArgument( - uri.contains("thrift"), - "Cannot initialize TestCatalog. The metastore connection uri must use thrift as the scheme."); - Preconditions.checkArgument( - warehouse != null, - "Cannot initialize TestCatalog. 
The base path for the catalog's warehouse directory must be set."); - this.tables = new HadoopTables(conf); - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public Configuration getConf() { - return this.conf; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java deleted file mode 100644 index d003fb1f65d4..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.util.List; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.CatalogProperties; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestCustomCatalog { - private static final String CATALOG_IMPL = - String.format( - "%s.%s.%s", - CustomCatalogs.ICEBERG_CATALOG_PREFIX, - CustomCatalogs.ICEBERG_DEFAULT_CATALOG, - CatalogProperties.CATALOG_IMPL); - private static final String WAREHOUSE = - String.format( - "%s.%s.%s", - CustomCatalogs.ICEBERG_CATALOG_PREFIX, - CustomCatalogs.ICEBERG_DEFAULT_CATALOG, - CatalogProperties.WAREHOUSE_LOCATION); - private static final String URI_KEY = - String.format( - "%s.%s.%s", - CustomCatalogs.ICEBERG_CATALOG_PREFIX, - CustomCatalogs.ICEBERG_DEFAULT_CATALOG, - CatalogProperties.URI); - private static final String TEST_CATALOG = "placeholder_catalog"; - private static final String TEST_CATALOG_IMPL = - String.format( - "%s.%s.%s", - CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG, CatalogProperties.CATALOG_IMPL); - private static final String TEST_WAREHOUSE = - String.format( - "%s.%s.%s", - CustomCatalogs.ICEBERG_CATALOG_PREFIX, - TEST_CATALOG, - CatalogProperties.WAREHOUSE_LOCATION); - private static final String TEST_URI_KEY = - String.format( - "%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, 
TEST_CATALOG, CatalogProperties.URI); - private static final String URI_VAL = "thrift://localhost:12345"; // dummy uri - private static final String CATALOG_VAL = "org.apache.iceberg.spark.source.TestCatalog"; - private static final TableIdentifier TABLE = TableIdentifier.of("default", "table"); - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - File tableDir = null; - String tableLocation = null; - HadoopTables tables; - - protected static SparkSession spark = null; - - @BeforeClass - public static void startMetastoreAndSpark() { - spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopMetastoreAndSpark() { - spark.stop(); - spark = null; - } - - @Before - public void setupTable() throws Exception { - SparkConf sparkConf = spark.sparkContext().conf(); - sparkConf.set( - String.format( - "%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, CustomCatalogs.ICEBERG_DEFAULT_CATALOG), - "placeholder"); - sparkConf.set( - String.format("%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG), "placeholder"); - this.tables = new HadoopTables(spark.sessionState().newHadoopConf()); - this.tableDir = temp.newFolder(); - tableDir.delete(); // created by table create - this.tableLocation = tableDir.toURI().toString(); - tables.create( - SCHEMA, PartitionSpec.unpartitioned(), String.format("%s/%s", tableLocation, TABLE.name())); - } - - @After - public void removeTable() { - SparkConf sparkConf = spark.sparkContext().conf(); - sparkConf.remove(CATALOG_IMPL); - sparkConf.remove(WAREHOUSE); - sparkConf.remove(URI_KEY); - tables.dropTable(String.format("%s/%s", tableLocation, TABLE.name())); - tableDir.delete(); - CustomCatalogs.clearCache(); - } - - @Test - public void withSparkOptions() { - - SparkConf sparkConf = spark.sparkContext().conf(); - sparkConf.set(CATALOG_IMPL, CATALOG_VAL); - sparkConf.set(URI_KEY, URI_VAL); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows( - "We have not set all properties", - IllegalArgumentException.class, - "The base path for the catalog's warehouse directory must be set", - () -> - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(TABLE.toString())); - - sparkConf.set(WAREHOUSE, tableLocation); - - df.select("id", "data").write().format("iceberg").mode("append").save(TABLE.toString()); - - List dfNew = - spark - .read() - .format("iceberg") - .load(TABLE.toString()) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Data should match", expected, dfNew); - } - - @Test - public void withSparkCatalog() { - - String catalogTable = String.format("%s.%s", TEST_CATALOG, TABLE.toString()); - SparkConf sparkConf = spark.sparkContext().conf(); - sparkConf.set(TEST_CATALOG_IMPL, CATALOG_VAL); - sparkConf.set(TEST_URI_KEY, URI_VAL); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows( - "We have not set all properties", - IllegalArgumentException.class, - "The base path for the catalog's warehouse directory must be set", - () -> df.select("id", 
"data").write().format("iceberg").mode("append").save(catalogTable)); - - sparkConf.set(TEST_WAREHOUSE, tableLocation); - - df.select("id", "data").write().format("iceberg").mode("append").save(catalogTable); - - List dfNew = - spark - .read() - .format("iceberg") - .load(catalogTable) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Data should match", expected, dfNew); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java deleted file mode 100644 index b3ceb0e2cbee..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java +++ /dev/null @@ -1,422 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - -import java.io.File; -import java.io.IOException; -import java.net.URI; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Random; -import org.apache.avro.generic.GenericData.Record; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Files; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.avro.AvroIterable; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkSQLProperties; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkWriteOptions; -import org.apache.iceberg.spark.data.AvroDataTest; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.SparkAvroReader; -import org.apache.iceberg.types.Types; -import org.apache.spark.SparkException; -import org.apache.spark.TaskContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.MapPartitionsFunction; -import org.apache.spark.sql.DataFrameWriter; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import 
org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.encoders.RowEncoder; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestDataFrameWrites extends AvroDataTest { - private static final Configuration CONF = new Configuration(); - - private final String format; - - @Parameterized.Parameters(name = "format = {0}") - public static Object[] parameters() { - return new Object[] {"parquet", "avro", "orc"}; - } - - public TestDataFrameWrites(String format) { - this.format = format; - } - - private static SparkSession spark = null; - private static JavaSparkContext sc = null; - - private Map tableProperties; - - private org.apache.spark.sql.types.StructType sparkSchema = - new org.apache.spark.sql.types.StructType( - new org.apache.spark.sql.types.StructField[] { - new org.apache.spark.sql.types.StructField( - "optionalField", - org.apache.spark.sql.types.DataTypes.StringType, - true, - org.apache.spark.sql.types.Metadata.empty()), - new org.apache.spark.sql.types.StructField( - "requiredField", - org.apache.spark.sql.types.DataTypes.StringType, - false, - org.apache.spark.sql.types.Metadata.empty()) - }); - - private Schema icebergSchema = - new Schema( - Types.NestedField.optional(1, "optionalField", Types.StringType.get()), - Types.NestedField.required(2, "requiredField", Types.StringType.get())); - - private List data0 = - Arrays.asList( - "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", - "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); - private List data1 = - Arrays.asList( - "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", - "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", - "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", - "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); - - @BeforeClass - public static void startSpark() { - TestDataFrameWrites.spark = SparkSession.builder().master("local[2]").getOrCreate(); - TestDataFrameWrites.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestDataFrameWrites.spark; - TestDataFrameWrites.spark = null; - TestDataFrameWrites.sc = null; - currentSpark.stop(); - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - File location = createTableFolder(); - Table table = createTable(schema, location); - writeAndValidateWithLocations(table, location, new File(location, "data")); - } - - @Test - public void testWriteWithCustomDataLocation() throws IOException { - File location = createTableFolder(); - File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir"); - Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location); - table - .updateProperties() - .set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()) - .commit(); - writeAndValidateWithLocations(table, location, tablePropertyDataLocation); - } - - private File createTableFolder() throws IOException { - File parent = temp.newFolder("parquet"); - File location = new File(parent, "test"); - Assert.assertTrue("Mkdir should succeed", location.mkdirs()); - return location; - } - - private Table createTable(Schema schema, File location) { - 
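    // HadoopTables resolves tables purely by filesystem location, so these tests run without any catalog configuration.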
HadoopTables tables = new HadoopTables(CONF); - return tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); - } - - private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) - throws IOException { - Schema tableSchema = table.schema(); // use the table schema because ids are reassigned - - table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - - Iterable expected = RandomData.generate(tableSchema, 100, 0L); - writeData(expected, tableSchema, location.toString()); - - table.refresh(); - - List actual = readTable(location.toString()); - - Iterator expectedIter = expected.iterator(); - Iterator actualIter = actual.iterator(); - while (expectedIter.hasNext() && actualIter.hasNext()) { - assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next()); - } - Assert.assertEquals( - "Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); - - table - .currentSnapshot() - .addedDataFiles(table.io()) - .forEach( - dataFile -> - Assert.assertTrue( - String.format( - "File should have the parent directory %s, but has: %s.", - expectedDataDir.getAbsolutePath(), dataFile.path()), - URI.create(dataFile.path().toString()) - .getPath() - .startsWith(expectedDataDir.getAbsolutePath()))); - } - - private List readTable(String location) { - Dataset result = spark.read().format("iceberg").load(location); - - return result.collectAsList(); - } - - private void writeData(Iterable records, Schema schema, String location) - throws IOException { - Dataset df = createDataset(records, schema); - DataFrameWriter writer = df.write().format("iceberg").mode("append"); - writer.save(location); - } - - private void writeDataWithFailOnPartition( - Iterable records, Schema schema, String location) throws IOException, SparkException { - final int numPartitions = 10; - final int partitionToFail = new Random().nextInt(numPartitions); - MapPartitionsFunction failOnFirstPartitionFunc = - (MapPartitionsFunction) - input -> { - int partitionId = TaskContext.getPartitionId(); - - if (partitionId == partitionToFail) { - throw new SparkException( - String.format("Intended exception in partition %d !", partitionId)); - } - return input; - }; - - Dataset df = - createDataset(records, schema) - .repartition(numPartitions) - .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); - // This trick is needed because Spark 3 handles decimal overflow in RowEncoder which "changes" - // nullability of the column to "true" regardless of original nullability. - // Setting "check-nullability" option to "false" doesn't help as it fails at Spark analyzer. 
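    // Rebuilding the DataFrame from the underlying RDD with the Iceberg-derived schema restores the
    // original nullability, so the append below still writes against the required column as declared.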
- Dataset convertedDf = df.sqlContext().createDataFrame(df.rdd(), convert(schema)); - DataFrameWriter writer = convertedDf.write().format("iceberg").mode("append"); - writer.save(location); - } - - private Dataset createDataset(Iterable records, Schema schema) throws IOException { - // this uses the SparkAvroReader to create a DataFrame from the list of records - // it assumes that SparkAvroReader is correct - File testFile = temp.newFile(); - Assert.assertTrue("Delete should succeed", testFile.delete()); - - try (FileAppender writer = - Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { - for (Record rec : records) { - writer.add(rec); - } - } - - // make sure the dataframe matches the records before moving on - List rows = Lists.newArrayList(); - try (AvroIterable reader = - Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { - - Iterator recordIter = records.iterator(); - Iterator readIter = reader.iterator(); - while (recordIter.hasNext() && readIter.hasNext()) { - InternalRow row = readIter.next(); - assertEqualsUnsafe(schema.asStruct(), recordIter.next(), row); - rows.add(row); - } - Assert.assertEquals( - "Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); - } - - JavaRDD rdd = sc.parallelize(rows); - return spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(schema), false); - } - - @Test - public void testNullableWithWriteOption() throws IOException { - Assume.assumeTrue( - "Spark 3 rejects writing nulls to a required column", spark.version().startsWith("2")); - - File location = new File(temp.newFolder("parquet"), "test"); - String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); - String targetPath = String.format("%s/nullable_poc/targetFolder/", location.toString()); - - tableProperties = ImmutableMap.of(TableProperties.WRITE_DATA_LOCATION, targetPath); - - // read this and append to iceberg dataset - spark - .read() - .schema(sparkSchema) - .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write() - .parquet(sourcePath); - - // this is our iceberg dataset to which we will append data - new HadoopTables(spark.sessionState().newHadoopConf()) - .create( - icebergSchema, - PartitionSpec.builderFor(icebergSchema).identity("requiredField").build(), - tableProperties, - targetPath); - - // this is the initial data inside the iceberg dataset - spark - .read() - .schema(sparkSchema) - .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(targetPath); - - // read from parquet and append to iceberg w/ nullability check disabled - spark - .read() - .schema(SparkSchemaUtil.convert(icebergSchema)) - .parquet(sourcePath) - .write() - .format("iceberg") - .option(SparkWriteOptions.CHECK_NULLABILITY, false) - .mode(SaveMode.Append) - .save(targetPath); - - // read all data - List rows = spark.read().format("iceberg").load(targetPath).collectAsList(); - Assert.assertEquals("Should contain 6 rows", 6, rows.size()); - } - - @Test - public void testNullableWithSparkSqlOption() throws IOException { - Assume.assumeTrue( - "Spark 3 rejects writing nulls to a required column", spark.version().startsWith("2")); - - File location = new File(temp.newFolder("parquet"), "test"); - String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); - String targetPath = 
String.format("%s/nullable_poc/targetFolder/", location.toString()); - - tableProperties = ImmutableMap.of(TableProperties.WRITE_DATA_LOCATION, targetPath); - - // read this and append to iceberg dataset - spark - .read() - .schema(sparkSchema) - .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write() - .parquet(sourcePath); - - SparkSession newSparkSession = - SparkSession.builder() - .master("local[2]") - .appName("NullableTest") - .config(SparkSQLProperties.CHECK_NULLABILITY, false) - .getOrCreate(); - - // this is our iceberg dataset to which we will append data - new HadoopTables(newSparkSession.sessionState().newHadoopConf()) - .create( - icebergSchema, - PartitionSpec.builderFor(icebergSchema).identity("requiredField").build(), - tableProperties, - targetPath); - - // this is the initial data inside the iceberg dataset - newSparkSession - .read() - .schema(sparkSchema) - .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(targetPath); - - // read from parquet and append to iceberg - newSparkSession - .read() - .schema(SparkSchemaUtil.convert(icebergSchema)) - .parquet(sourcePath) - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(targetPath); - - // read all data - List rows = newSparkSession.read().format("iceberg").load(targetPath).collectAsList(); - Assert.assertEquals("Should contain 6 rows", 6, rows.size()); - } - - @Test - public void testFaultToleranceOnWrite() throws IOException { - File location = createTableFolder(); - Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); - Table table = createTable(schema, location); - - Iterable records = RandomData.generate(schema, 100, 0L); - writeData(records, schema, location.toString()); - - table.refresh(); - - Snapshot snapshotBeforeFailingWrite = table.currentSnapshot(); - List resultBeforeFailingWrite = readTable(location.toString()); - - Iterable records2 = RandomData.generate(schema, 100, 0L); - Assertions.assertThatThrownBy( - () -> writeDataWithFailOnPartition(records2, schema, location.toString())) - .isInstanceOf(SparkException.class); - - table.refresh(); - - Snapshot snapshotAfterFailingWrite = table.currentSnapshot(); - List resultAfterFailingWrite = readTable(location.toString()); - - Assert.assertEquals(snapshotAfterFailingWrite, snapshotBeforeFailingWrite); - Assert.assertEquals(resultAfterFailingWrite, resultBeforeFailingWrite); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java deleted file mode 100644 index 4ae489edffe1..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.IOException; -import java.math.RoundingMode; -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.math.LongMath; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkWriteOptions; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestDataSourceOptions { - - private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static SparkSession spark = null; - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @BeforeClass - public static void startSpark() { - TestDataSourceOptions.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestDataSourceOptions.spark; - TestDataSourceOptions.spark = null; - currentSpark.stop(); - } - - @Test - public void testWriteFormatOptionOverridesTableProperties() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); - Table table = tables.create(SCHEMA, spec, options, tableLocation); - - List expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, "parquet") - .mode(SaveMode.Append) - .save(tableLocation); - - try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach( - task -> { - FileFormat fileFormat = 
FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.PARQUET, fileFormat); - }); - } - } - - @Test - public void testNoWriteFormatOption() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); - Table table = tables.create(SCHEMA, spec, options, tableLocation); - - List expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach( - task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.AVRO, fileFormat); - }); - } - } - - @Test - public void testHadoopOptions() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); - String originalDefaultFS = sparkHadoopConf.get("fs.default.name"); - - try { - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - tables.create(SCHEMA, spec, options, tableLocation); - - // set an invalid value for 'fs.default.name' in Spark Hadoop config - // to verify that 'hadoop.' data source options are propagated correctly - sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000"); - - List expectedRecords = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .option("hadoop.fs.default.name", "file:///") - .save(tableLocation); - - Dataset resultDf = - spark - .read() - .format("iceberg") - .option("hadoop.fs.default.name", "file:///") - .load(tableLocation); - List resultRecords = - resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - - Assert.assertEquals("Records should match", expectedRecords, resultRecords); - } finally { - sparkHadoopConf.set("fs.default.name", originalDefaultFS); - } - } - - @Test - public void testSplitOptionsOverridesTableProperties() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - options.put(TableProperties.SPLIT_SIZE, String.valueOf(128L * 1024 * 1024)); // 128Mb - options.put( - TableProperties.DEFAULT_FILE_FORMAT, - String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable - Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - - List expectedRecords = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf - .select("id", "data") - .repartition(1) - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); - - List files = - Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); - Assert.assertEquals("Should have written 1 file", 1, files.size()); - - 
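    // Use half of the single data file's size as the split size, so the read option below should
    // produce two Spark partitions, overriding the 128 MB split size set in the table properties.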
long fileSize = files.get(0).fileSizeInBytes(); - long splitSize = LongMath.divide(fileSize, 2, RoundingMode.CEILING); - - Dataset resultDf = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) - .load(tableLocation); - - Assert.assertEquals("Spark partitions should match", 2, resultDf.javaRDD().getNumPartitions()); - } - - @Test - public void testIncrementalScanOptions() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - Table table = tables.create(SCHEMA, spec, options, tableLocation); - - List expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d")); - for (SimpleRecord record : expectedRecords) { - Dataset originalDf = - spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - } - List snapshotIds = SnapshotUtil.currentAncestorIds(table); - - // start-snapshot-id and snapshot-id are both configured. - AssertHelpers.assertThrows( - "Check both start-snapshot-id and snapshot-id are configured", - IllegalArgumentException.class, - "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", - () -> { - spark - .read() - .format("iceberg") - .option("snapshot-id", snapshotIds.get(3).toString()) - .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation) - .explain(); - }); - - // end-snapshot-id and as-of-timestamp are both configured. - AssertHelpers.assertThrows( - "Check both start-snapshot-id and snapshot-id are configured", - IllegalArgumentException.class, - "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", - () -> { - spark - .read() - .format("iceberg") - .option( - SparkReadOptions.AS_OF_TIMESTAMP, - Long.toString(table.snapshot(snapshotIds.get(3)).timestampMillis())) - .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation) - .explain(); - }); - - // only end-snapshot-id is configured. - AssertHelpers.assertThrows( - "Check both start-snapshot-id and snapshot-id are configured", - IllegalArgumentException.class, - "Cannot only specify option end-snapshot-id to do incremental scan", - () -> { - spark - .read() - .format("iceberg") - .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation) - .explain(); - }); - - // test (1st snapshot, current snapshot] incremental scan. - List result = - spark - .read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Records should match", expectedRecords.subList(1, 4), result); - - // test (2nd snapshot, 3rd snapshot] incremental scan. 
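    // snapshotIds from SnapshotUtil.currentAncestorIds(table) is ordered newest first, so index 2 is
    // the second snapshot and index 1 is the third; only the record appended by the third commit
    // (expectedRecords.subList(2, 3)) should come back from this scan.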
- List result1 = - spark - .read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(2).toString()) - .option("end-snapshot-id", snapshotIds.get(1).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Records should match", expectedRecords.subList(2, 3), result1); - } - - @Test - public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - Table table = tables.create(SCHEMA, spec, options, tableLocation); - - List expectedRecords = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - // produce 1st manifest - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - // produce 2nd manifest - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - List manifests = table.currentSnapshot().allManifests(table.io()); - - Assert.assertEquals("Must be 2 manifests", 2, manifests.size()); - - // set the target metadata split size so each manifest ends up in a separate split - table - .updateProperties() - .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length())) - .commit(); - - Dataset entriesDf = spark.read().format("iceberg").load(tableLocation + "#entries"); - Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions()); - - // override the table property using options - entriesDf = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) - .load(tableLocation + "#entries"); - Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions()); - } - - @Test - public void testDefaultMetadataSplitSize() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map options = Maps.newHashMap(); - Table table = tables.create(SCHEMA, spec, options, tableLocation); - - List expectedRecords = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size - - int expectedSplits = - ((int) - tables - .load(tableLocation + "#entries") - .currentSnapshot() - .allManifests(table.io()) - .get(0) - .length() - + splitSize - - 1) - / splitSize; - - Dataset metadataDf = spark.read().format("iceberg").load(tableLocation + "#entries"); - - int partitionNum = metadataDf.javaRDD().getNumPartitions(); - Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum); - } - - @Test - public void testExtraSnapshotMetadata() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - HadoopTables tables = new HadoopTables(CONF); - tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - - List expectedRecords = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, 
"b")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") - .save(tableLocation); - - Table table = tables.load(tableLocation); - - Assert.assertTrue(table.currentSnapshot().summary().get("extra-key").equals("someValue")); - Assert.assertTrue(table.currentSnapshot().summary().get("another-key").equals("anotherValue")); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java deleted file mode 100644 index af3796cb1a41..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ /dev/null @@ -1,658 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.Files.localOutput; -import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; -import static org.apache.spark.sql.functions.callUDF; -import static org.apache.spark.sql.functions.column; - -import java.io.File; -import java.io.IOException; -import java.sql.Timestamp; -import java.time.OffsetDateTime; -import java.util.List; -import java.util.UUID; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.expressions.Literal; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.data.GenericsHelpers; -import org.apache.iceberg.transforms.Transform; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.api.java.UDF1; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.UnsafeRow; -import org.apache.spark.sql.sources.And; -import org.apache.spark.sql.sources.EqualTo; -import org.apache.spark.sql.sources.Filter; -import org.apache.spark.sql.sources.GreaterThan; -import org.apache.spark.sql.sources.In; -import org.apache.spark.sql.sources.LessThan; -import org.apache.spark.sql.sources.StringStartsWith; -import org.apache.spark.sql.sources.v2.DataSourceOptions; -import org.apache.spark.sql.sources.v2.reader.DataSourceReader; -import org.apache.spark.sql.sources.v2.reader.InputPartition; -import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters; -import org.apache.spark.sql.types.IntegerType$; -import org.apache.spark.sql.types.LongType$; -import org.apache.spark.sql.types.StringType$; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestFilteredScan { - private static final Configuration CONF = new Configuration(); - private static final HadoopTables TABLES = new HadoopTables(CONF); - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), - Types.NestedField.optional(3, "data", Types.StringType.get())); - - private static final PartitionSpec BUCKET_BY_ID = - PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build(); - - private static final PartitionSpec PARTITION_BY_DAY = - PartitionSpec.builderFor(SCHEMA).day("ts").build(); - - private static final PartitionSpec PARTITION_BY_HOUR = - PartitionSpec.builderFor(SCHEMA).hour("ts").build(); - 
- private static final PartitionSpec PARTITION_BY_DATA = - PartitionSpec.builderFor(SCHEMA).identity("data").build(); - - private static final PartitionSpec PARTITION_BY_ID = - PartitionSpec.builderFor(SCHEMA).identity("id").build(); - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestFilteredScan.spark = SparkSession.builder().master("local[2]").getOrCreate(); - - // define UDFs used by partition tests - Transform bucket4 = Transforms.bucket(Types.LongType.get(), 4); - spark.udf().register("bucket4", (UDF1) bucket4::apply, IntegerType$.MODULE$); - - Transform day = Transforms.day(Types.TimestampType.withZone()); - spark - .udf() - .register( - "ts_day", - (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); - - Transform hour = Transforms.hour(Types.TimestampType.withZone()); - spark - .udf() - .register( - "ts_hour", - (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); - - spark.udf().register("data_ident", (UDF1) data -> data, StringType$.MODULE$); - spark.udf().register("id_ident", (UDF1) id -> id, LongType$.MODULE$); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestFilteredScan.spark; - TestFilteredScan.spark = null; - currentSpark.stop(); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private final String format; - private final boolean vectorized; - - @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"parquet", false}, - {"parquet", true}, - {"avro", false}, - {"orc", false}, - {"orc", true} - }; - } - - public TestFilteredScan(String format, boolean vectorized) { - this.format = format; - this.vectorized = vectorized; - } - - private File parent = null; - private File unpartitioned = null; - private List records = null; - - @Before - public void writeUnpartitionedTable() throws IOException { - this.parent = temp.newFolder("TestFilteredScan"); - this.unpartitioned = new File(parent, "unpartitioned"); - File dataFolder = new File(unpartitioned, "data"); - Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs()); - - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString()); - Schema tableSchema = table.schema(); // use the table schema because ids are reassigned - - FileFormat fileFormat = FileFormat.fromString(format); - - File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString())); - - // create records using the table's schema - this.records = testRecords(tableSchema); - - try (FileAppender writer = - new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { - writer.addAll(records); - } - - DataFile file = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); - - table.newAppend().appendFile(file).commit(); - } - - @Test - public void testUnpartitionedIDFilters() { - DataSourceOptions options = - new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString())); - - IcebergSource source = new IcebergSource(); - - for (int i = 0; i < 10; i += 1) { - DataSourceReader reader = source.createReader(options); - - pushFilters(reader, EqualTo.apply("id", i)); - - List> tasks = reader.planInputPartitions(); - Assert.assertEquals("Should only create one task for a small file", 1, 
tasks.size()); - - // validate row filtering - assertEqualsSafe( - SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); - } - } - - @Test - public void testUnpartitionedCaseInsensitiveIDFilters() { - DataSourceOptions options = - new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString())); - - // set spark.sql.caseSensitive to false - String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive"); - TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false"); - - try { - IcebergSource source = new IcebergSource(); - - for (int i = 0; i < 10; i += 1) { - DataSourceReader reader = source.createReader(options); - - pushFilters( - reader, - EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match - - List> tasks = reader.planInputPartitions(); - Assert.assertEquals("Should only create one task for a small file", 1, tasks.size()); - - // validate row filtering - assertEqualsSafe( - SCHEMA.asStruct(), - expected(i), - read(unpartitioned.toString(), vectorized, "id = " + i)); - } - } finally { - // return global conf to previous state - TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest); - } - } - - @Test - public void testUnpartitionedTimestampFilter() { - DataSourceOptions options = - new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString())); - - IcebergSource source = new IcebergSource(); - - DataSourceReader reader = source.createReader(options); - - pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); - - List> tasks = reader.planInputPartitions(); - Assert.assertEquals("Should only create one task for a small file", 1, tasks.size()); - - assertEqualsSafe( - SCHEMA.asStruct(), - expected(5, 6, 7, 8, 9), - read( - unpartitioned.toString(), - vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); - } - - @Test - public void testBucketPartitionedIDFilters() { - File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); - - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); - - IcebergSource source = new IcebergSource(); - DataSourceReader unfiltered = source.createReader(options); - Assert.assertEquals( - "Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().size()); - - for (int i = 0; i < 10; i += 1) { - DataSourceReader reader = source.createReader(options); - - pushFilters(reader, EqualTo.apply("id", i)); - - List> tasks = reader.planInputPartitions(); - - // validate predicate push-down - Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size()); - - // validate row filtering - assertEqualsSafe( - SCHEMA.asStruct(), expected(i), read(location.toString(), vectorized, "id = " + i)); - } - } - - @SuppressWarnings("checkstyle:AvoidNestedBlocks") - @Test - public void testDayPartitionedTimestampFilters() { - File location = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts"); - - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); - - IcebergSource source = new IcebergSource(); - DataSourceReader unfiltered = source.createReader(options); - Assert.assertEquals( - "Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().size()); - - { - DataSourceReader reader = source.createReader(options); - - pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); - - 
List> tasks = reader.planInputPartitions(); - Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.size()); - - assertEqualsSafe( - SCHEMA.asStruct(), - expected(5, 6, 7, 8, 9), - read( - location.toString(), - vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); - } - - { - DataSourceReader reader = source.createReader(options); - - pushFilters( - reader, - And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); - - List> tasks = reader.planInputPartitions(); - Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.size()); - - assertEqualsSafe( - SCHEMA.asStruct(), - expected(1, 2), - read( - location.toString(), - vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " - + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); - } - } - - @SuppressWarnings("checkstyle:AvoidNestedBlocks") - @Test - public void testHourPartitionedTimestampFilters() { - File location = - buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); - - IcebergSource source = new IcebergSource(); - DataSourceReader unfiltered = source.createReader(options); - Assert.assertEquals( - "Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().size()); - - { - DataSourceReader reader = source.createReader(options); - - pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); - - List> tasks = reader.planInputPartitions(); - Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.size()); - - assertEqualsSafe( - SCHEMA.asStruct(), - expected(8, 9, 7, 6, 5), - read( - location.toString(), - vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); - } - - { - DataSourceReader reader = source.createReader(options); - - pushFilters( - reader, - And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); - - List> tasks = reader.planInputPartitions(); - Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.size()); - - assertEqualsSafe( - SCHEMA.asStruct(), - expected(2, 1), - read( - location.toString(), - vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " - + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); - } - } - - @SuppressWarnings("checkstyle:AvoidNestedBlocks") - @Test - public void testFilterByNonProjectedColumn() { - { - Schema actualProjection = SCHEMA.select("id", "data"); - List expected = Lists.newArrayList(); - for (Record rec : expected(5, 6, 7, 8, 9)) { - expected.add(projectFlat(actualProjection, rec)); - } - - assertEqualsSafe( - actualProjection.asStruct(), - expected, - read( - unpartitioned.toString(), - vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", - "id", - "data")); - } - - { - // only project id: ts will be projected because of the filter, but data will not be included - - Schema actualProjection = SCHEMA.select("id"); - List expected = Lists.newArrayList(); - for (Record rec : expected(1, 2)) { - expected.add(projectFlat(actualProjection, rec)); - } - - assertEqualsSafe( - actualProjection.asStruct(), - expected, - read( - unpartitioned.toString(), - vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " - + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", - "id")); - } - } - - @Test - public void 
testInFilter() { - File location = - buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); - - IcebergSource source = new IcebergSource(); - DataSourceReader reader = source.createReader(options); - pushFilters(reader, new In("data", new String[] {"foo", "junction", "brush", null})); - - Assert.assertEquals(2, reader.planInputPartitions().size()); - } - - @Test - public void testInFilterForTimestamp() { - File location = - buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); - - IcebergSource source = new IcebergSource(); - DataSourceReader reader = source.createReader(options); - pushFilters( - reader, - new In( - "ts", - new Timestamp[] { - new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000), - new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000), - new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000), - new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000), - null - })); - - Assert.assertEquals( - "Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size()); - } - - @Test - public void testPartitionedByDataStartsWithFilter() { - File location = - buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); - - IcebergSource source = new IcebergSource(); - DataSourceReader reader = source.createReader(options); - pushFilters(reader, new StringStartsWith("data", "junc")); - - Assert.assertEquals(1, reader.planInputPartitions().size()); - } - - @Test - public void testPartitionedByIdStartsWith() { - File location = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); - - IcebergSource source = new IcebergSource(); - DataSourceReader reader = source.createReader(options); - pushFilters(reader, new StringStartsWith("data", "junc")); - - Assert.assertEquals(1, reader.planInputPartitions().size()); - } - - @Test - public void testUnpartitionedStartsWith() { - Dataset df = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); - - List matchedData = - df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); - - Assert.assertEquals(1, matchedData.size()); - Assert.assertEquals("junction", matchedData.get(0)); - } - - private static Record projectFlat(Schema projection, Record record) { - Record result = GenericRecord.create(projection); - List fields = projection.asStruct().fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - result.set(i, record.getField(field.name())); - } - return result; - } - - public static void assertEqualsUnsafe( - Types.StructType struct, List expected, List actual) { - // TODO: match records by ID - int numRecords = Math.min(expected.size(), actual.size()); - for (int i = 0; i < numRecords; i += 1) { - GenericsHelpers.assertEqualsUnsafe(struct, expected.get(i), actual.get(i)); - } - Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); - } - - public static void assertEqualsSafe( - 
Types.StructType struct, List expected, List actual) { - // TODO: match records by ID - int numRecords = Math.min(expected.size(), actual.size()); - for (int i = 0; i < numRecords; i += 1) { - GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i)); - } - Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); - } - - private List expected(int... ordinals) { - List expected = Lists.newArrayListWithExpectedSize(ordinals.length); - for (int ord : ordinals) { - expected.add(records.get(ord)); - } - return expected; - } - - private void pushFilters(DataSourceReader reader, Filter... filters) { - Assertions.assertThat(reader).isInstanceOf(SupportsPushDownFilters.class); - SupportsPushDownFilters filterable = (SupportsPushDownFilters) reader; - filterable.pushFilters(filters); - } - - private File buildPartitionedTable( - String desc, PartitionSpec spec, String udf, String partitionColumn) { - File location = new File(parent, desc); - Table byId = TABLES.create(SCHEMA, spec, location.toString()); - - // Do not combine or split files because the tests expect a split per partition. - // A target split size of 2048 helps us achieve that. - byId.updateProperties().set("read.split.target-size", "2048").commit(); - - // copy the unpartitioned table into the partitioned table to produce the partitioned data - Dataset allRows = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); - - allRows - .coalesce(1) // ensure only 1 file per partition is written - .withColumn("part", callUDF(udf, column(partitionColumn))) - .sortWithinPartitions("part") - .drop("part") - .write() - .format("iceberg") - .mode("append") - .save(byId.location()); - - return location; - } - - private List testRecords(Schema schema) { - return Lists.newArrayList( - record(schema, 0L, parse("2017-12-22T09:20:44.294658+00:00"), "junction"), - record(schema, 1L, parse("2017-12-22T07:15:34.582910+00:00"), "alligator"), - record(schema, 2L, parse("2017-12-22T06:02:09.243857+00:00"), ""), - record(schema, 3L, parse("2017-12-22T03:10:11.134509+00:00"), "clapping"), - record(schema, 4L, parse("2017-12-22T00:34:00.184671+00:00"), "brush"), - record(schema, 5L, parse("2017-12-21T22:20:08.935889+00:00"), "trap"), - record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), - record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), - record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), - record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish")); - } - - private static List read(String table, boolean vectorized, String expr) { - return read(table, vectorized, expr, "*"); - } - - private static List read( - String table, boolean vectorized, String expr, String select0, String... selectN) { - Dataset dataset = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table) - .filter(expr) - .select(select0, selectN); - return dataset.collectAsList(); - } - - private static OffsetDateTime parse(String timestamp) { - return OffsetDateTime.parse(timestamp); - } - - private static long instant(String timestamp) { - return Literal.of(timestamp).to(Types.TimestampType.withZone()).value(); - } - - private static Record record(Schema schema, Object... 
values) { - Record rec = GenericRecord.create(schema); - for (int i = 0; i < values.length; i += 1) { - rec.set(i, values[i]); - } - return rec; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java deleted file mode 100644 index da5677395f01..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.Files.localOutput; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.UUID; -import org.apache.avro.generic.GenericData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.PartitionSpecParser; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.TestHelpers; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.execution.streaming.MemoryStream; -import org.apache.spark.sql.streaming.StreamingQuery; -import org.apache.spark.sql.streaming.StreamingQueryException; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import scala.collection.JavaConversions; - -public class TestForwardCompatibility { - private static final Configuration CONF = new Configuration(); - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); - - // create a spec for the schema that uses a "zero" transform that produces all 0s - private 
static final PartitionSpec UNKNOWN_SPEC = - PartitionSpecParser.fromJson( - SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); - // create a fake spec to use to write table metadata - private static final PartitionSpec FAKE_SPEC = - PartitionSpecParser.fromJson( - SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestForwardCompatibility.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestForwardCompatibility.spark; - TestForwardCompatibility.spark = null; - currentSpark.stop(); - } - - @Test - public void testSparkWriteFailsUnknownTransform() throws IOException { - File parent = temp.newFolder("avro"); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - dataFolder.mkdirs(); - - HadoopTables tables = new HadoopTables(CONF); - tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - AssertHelpers.assertThrows( - "Should reject write with unsupported transform", - UnsupportedOperationException.class, - "Cannot write using unsupported transforms: zero", - () -> - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(location.toString())); - } - - @Test - public void testSparkStreamingWriteFailsUnknownTransform() throws IOException { - File parent = temp.newFolder("avro"); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - dataFolder.mkdirs(); - File checkpoint = new File(parent, "checkpoint"); - checkpoint.mkdirs(); - - HadoopTables tables = new HadoopTables(CONF); - tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); - - MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - StreamingQuery query = - inputStream - .toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()) - .start(); - - List batch1 = Lists.newArrayList(1, 2); - send(batch1, inputStream); - - AssertHelpers.assertThrows( - "Should reject streaming write with unsupported transform", - StreamingQueryException.class, - "Cannot write using unsupported transforms: zero", - query::processAllAvailable); - } - - @Test - public void testSparkCanReadUnknownTransform() throws IOException { - File parent = temp.newFolder("avro"); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - dataFolder.mkdirs(); - - HadoopTables tables = new HadoopTables(CONF); - Table table = tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); - - // enable snapshot inheritance to avoid rewriting the manifest with an unknown transform - table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - - List expected = RandomData.generateList(table.schema(), 100, 1L); - - File parquetFile = - new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); - 
FileAppender writer = - Parquet.write(localOutput(parquetFile)).schema(table.schema()).build(); - try { - writer.addAll(expected); - } finally { - writer.close(); - } - - DataFile file = - DataFiles.builder(FAKE_SPEC) - .withInputFile(localInput(parquetFile)) - .withMetrics(writer.metrics()) - .withPartitionPath("id_zero=0") - .build(); - - OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); - ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile); - try { - manifestWriter.add(file); - } finally { - manifestWriter.close(); - } - - table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit(); - - Dataset df = spark.read().format("iceberg").load(location.toString()); - - List rows = df.collectAsList(); - Assert.assertEquals("Should contain 100 rows", 100, rows.size()); - - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(table.schema().asStruct(), expected.get(i), rows.get(i)); - } - } - - private MemoryStream newMemoryStream(int id, SQLContext sqlContext, Encoder encoder) { - return new MemoryStream<>(id, sqlContext, encoder); - } - - private void send(List records, MemoryStream stream) { - stream.addData(JavaConversions.asScalaBuffer(records)); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java deleted file mode 100644 index 72e7b72b508e..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Table; -import org.apache.spark.sql.sources.v2.DataSourceOptions; - -public class TestIcebergSource extends IcebergSource { - @Override - public String shortName() { - return "iceberg-test"; - } - - @Override - protected Table findTable(DataSourceOptions options, Configuration conf) { - return TestTables.load(options.get("iceberg.table.name").get()); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java deleted file mode 100644 index b55ba0e2199a..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.File; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.hadoop.HadoopTables; -import org.junit.Before; - -public class TestIcebergSourceHadoopTables extends TestIcebergSourceTablesBase { - - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - - File tableDir = null; - String tableLocation = null; - - @Before - public void setupTable() throws Exception { - this.tableDir = temp.newFolder(); - tableDir.delete(); // created by table create - - this.tableLocation = tableDir.toURI().toString(); - } - - @Override - public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec) { - if (spec.equals(PartitionSpec.unpartitioned())) { - return TABLES.create(schema, tableLocation); - } - return TABLES.create(schema, spec, tableLocation); - } - - @Override - public Table loadTable(TableIdentifier ident, String entriesSuffix) { - return TABLES.load(loadLocation(ident, entriesSuffix)); - } - - @Override - public String loadLocation(TableIdentifier ident, String entriesSuffix) { - return String.format("%s#%s", loadLocation(ident), entriesSuffix); - } - - @Override - public String loadLocation(TableIdentifier ident) { - return tableLocation; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java deleted file mode 100644 index f6df8d495b90..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import java.io.IOException; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.junit.After; -import org.junit.BeforeClass; - -public class TestIcebergSourceHiveTables extends TestIcebergSourceTablesBase { - - private static TableIdentifier currentIdentifier; - - @BeforeClass - public static void start() { - Namespace db = Namespace.of("db"); - if (!catalog.namespaceExists(db)) { - catalog.createNamespace(db); - } - } - - @After - public void dropTable() throws IOException { - if (currentIdentifier != null) { - Table table = catalog.loadTable(currentIdentifier); - Path tablePath = new Path(table.location()); - FileSystem fs = tablePath.getFileSystem(spark.sessionState().newHadoopConf()); - fs.delete(tablePath, true); - catalog.dropTable(currentIdentifier, false); - currentIdentifier = null; - } - } - - @Override - public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec) { - TestIcebergSourceHiveTables.currentIdentifier = ident; - return TestIcebergSourceHiveTables.catalog.createTable(ident, schema, spec); - } - - @Override - public Table loadTable(TableIdentifier ident, String entriesSuffix) { - TableIdentifier identifier = - TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix); - return TestIcebergSourceHiveTables.catalog.loadTable(identifier); - } - - @Override - public String loadLocation(TableIdentifier ident, String entriesSuffix) { - return String.format("%s.%s", loadLocation(ident), entriesSuffix); - } - - @Override - public String loadLocation(TableIdentifier ident) { - return ident.toString(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java deleted file mode 100644 index bf7e6e0960a0..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java +++ /dev/null @@ -1,1801 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.ManifestContent.DATA; -import static org.apache.iceberg.ManifestContent.DELETES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Comparator; -import java.util.List; -import java.util.StringJoiner; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.actions.DeleteOrphanFiles; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.avro.AvroSchemaUtil; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.io.OutputFileFactory; -import org.apache.iceberg.mapping.MappingUtil; -import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkTableUtil; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.spark.actions.SparkActions; -import org.apache.iceberg.spark.data.TestHelpers; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.spark.SparkException; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public abstract class TestIcebergSourceTablesBase extends SparkTestBase { - - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - - private static final Schema SCHEMA2 = - new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "category", Types.StringType.get())); - - private static final Schema SCHEMA3 = - new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(3, 
"category", Types.StringType.get())); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - public abstract Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec); - - public abstract Table loadTable(TableIdentifier ident, String entriesSuffix); - - public abstract String loadLocation(TableIdentifier ident, String entriesSuffix); - - public abstract String loadLocation(TableIdentifier ident); - - @Test - public synchronized void testTablesSupport() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); - createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "1"), new SimpleRecord(2, "2"), new SimpleRecord(3, "3")); - - Dataset inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(loadLocation(tableIdentifier)); - - Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - List actualRecords = - resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - - Assert.assertEquals("Records should match", expectedRecords, actualRecords); - } - - @Test - public void testEntriesTable() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Table entriesTable = loadTable(tableIdentifier, "entries"); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .collectAsList(); - - Snapshot snapshot = table.currentSnapshot(); - - Assert.assertEquals( - "Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); - - InputFile manifest = table.io().newInputFile(snapshot.allManifests(table.io()).get(0).path()); - List expected = Lists.newArrayList(); - try (CloseableIterable rows = - Avro.read(manifest).project(entriesTable.schema()).build()) { - // each row must inherit snapshot_id and sequence_number - rows.forEach( - row -> { - row.put(2, 0L); // data sequence number - row.put(3, 0L); // file sequence number - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); - } - - Assert.assertEquals("Entries table should have one row", 1, expected.size()); - Assert.assertEquals("Actual results should have one row", 1, actual.size()); - TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(0), actual.get(0)); - } - - @Test - public void testEntriesTablePartitionedPrune() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - List actual = - spark - .read() - 
.format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("status") - .collectAsList(); - - Assert.assertEquals("Results should contain only one status", 1, actual.size()); - Assert.assertEquals("That status should be Added (1)", 1, actual.get(0).getInt(0)); - } - - @Test - public void testEntriesTableDataFilePrune() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - - List singleActual = - rowsToJava( - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path") - .collectAsList()); - - List singleExpected = ImmutableList.of(row(file.path())); - - assertEquals( - "Should prune a single element from a nested struct", singleExpected, singleActual); - } - - @Test - public void testEntriesTableDataFilePruneMulti() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - - List multiActual = - rowsToJava( - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select( - "data_file.file_path", - "data_file.value_counts", - "data_file.record_count", - "data_file.column_sizes") - .collectAsList()); - - List multiExpected = - ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); - - assertEquals("Should prune a single element from a nested struct", multiExpected, multiActual); - } - - @Test - public void testFilesSelectMap() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - - List multiActual = - rowsToJava( - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .select("file_path", "value_counts", "record_count", "column_sizes") - .collectAsList()); - - List multiExpected = - ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); - - assertEquals("Should prune a single element from a row", multiExpected, multiActual); - } - - @Test - public void testAllEntriesTable() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test"); - Table table = 
createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Table entriesTable = loadTable(tableIdentifier, "all_entries"); - - Dataset df1 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // delete the first file to test that not only live files are listed - table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - - // add a second file - df2.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // ensure table data isn't stale - table.refresh(); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .orderBy("snapshot_id") - .collectAsList(); - - List expected = Lists.newArrayList(); - for (ManifestFile manifest : - Iterables.concat( - Iterables.transform( - table.snapshots(), snapshot -> snapshot.allManifests(table.io())))) { - InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = - Avro.read(in).project(entriesTable.schema()).build()) { - // each row must inherit snapshot_id and sequence_number - rows.forEach( - row -> { - row.put(2, 0L); // data sequence number - row.put(3, 0L); // file sequence number - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); - } - } - - expected.sort(Comparator.comparing(o -> (Long) o.get("snapshot_id"))); - - Assert.assertEquals("Entries table should have 3 rows", 3, expected.size()); - Assert.assertEquals("Actual results should have 3 rows", 3, actual.size()); - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); - } - } - - @Test - public void testCountEntriesTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "count_entries_test"); - createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - // init load - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - final int expectedEntryCount = 1; - - // count entries - Assert.assertEquals( - "Count should return " + expectedEntryCount, - expectedEntryCount, - spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); - - // count all_entries - Assert.assertEquals( - "Count should return " + expectedEntryCount, - expectedEntryCount, - spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); - } - - @Test - public void testFilesTable() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - Table entriesTable = loadTable(tableIdentifier, "entries"); - - Dataset df1 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // add a 
second file - df2.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // delete the first file to test that only live files are listed - table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - - Dataset filesTableDs = - spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")); - List actual = selectNonDerived(filesTableDs).collectAsList(); - - List expected = Lists.newArrayList(); - for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { - InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = - Avro.read(in).project(entriesTable.schema()).build()) { - for (GenericData.Record record : rows) { - if ((Integer) record.get("status") < 2 /* added or existing */) { - GenericData.Record file = (GenericData.Record) record.get("data_file"); - asMetadataRecord(file); - expected.add(file); - } - } - } - } - - Assert.assertEquals("Files table should have one row", 1, expected.size()); - Assert.assertEquals("Actual results should have one row", 1, actual.size()); - TestHelpers.assertEqualsSafe(nonDerivedSchema(filesTableDs), expected.get(0), actual.get(0)); - } - - @Test - public void testFilesTableWithSnapshotIdInheritance() throws Exception { - spark.sql("DROP TABLE IF EXISTS parquet_table"); - - TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_inheritance_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - Table entriesTable = loadTable(tableIdentifier, "entries"); - - spark.sql( - String.format( - "CREATE TABLE parquet_table (data string, id int) " - + "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); - - List records = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); - - Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); - - NameMapping mapping = MappingUtil.create(table.schema()); - String mappingJson = NameMappingParser.toJson(mapping); - - table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); - - try { - String stagingLocation = table.location() + "/metadata"; - SparkTableUtil.importSparkTable( - spark, - new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), - table, - stagingLocation); - - Dataset filesTableDs = - spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")); - List actual = selectNonDerived(filesTableDs).collectAsList(); - - List expected = Lists.newArrayList(); - for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { - InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = - Avro.read(in).project(entriesTable.schema()).build()) { - for (GenericData.Record record : rows) { - GenericData.Record file = (GenericData.Record) record.get("data_file"); - asMetadataRecord(file); - expected.add(file); - } - } - } - - Types.StructType struct = nonDerivedSchema(filesTableDs); - Assert.assertEquals("Files table should have one row", 2, expected.size()); - Assert.assertEquals("Actual results should have one row", 2, actual.size()); - TestHelpers.assertEqualsSafe(struct, expected.get(0), actual.get(0)); - TestHelpers.assertEqualsSafe(struct, expected.get(1), actual.get(1)); - } finally { - spark.sql("DROP TABLE 
parquet_table"); - } - } - - @Test - public void testEntriesTableWithSnapshotIdInheritance() throws Exception { - spark.sql("DROP TABLE IF EXISTS parquet_table"); - - TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_inheritance_test"); - PartitionSpec spec = SPEC; - Table table = createTable(tableIdentifier, SCHEMA, spec); - - table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - - spark.sql( - String.format( - "CREATE TABLE parquet_table (data string, id int) " - + "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); - - List records = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); - - Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); - - try { - String stagingLocation = table.location() + "/metadata"; - SparkTableUtil.importSparkTable( - spark, - new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), - table, - stagingLocation); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("sequence_number", "snapshot_id", "data_file") - .collectAsList(); - - table.refresh(); - - long snapshotId = table.currentSnapshot().snapshotId(); - - Assert.assertEquals("Entries table should have 2 rows", 2, actual.size()); - Assert.assertEquals("Sequence number must match", 0, actual.get(0).getLong(0)); - Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(0).getLong(1)); - Assert.assertEquals("Sequence number must match", 0, actual.get(1).getLong(0)); - Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(1).getLong(1)); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testFilesUnpartitionedTable() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_files_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Table entriesTable = loadTable(tableIdentifier, "entries"); - - Dataset df1 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - DataFile toDelete = - Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); - - // add a second file - df2.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // delete the first file to test that only live files are listed - table.newDelete().deleteFile(toDelete).commit(); - - Dataset filesTableDs = - spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")); - List actual = selectNonDerived(filesTableDs).collectAsList(); - - List expected = Lists.newArrayList(); - for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { - InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = - Avro.read(in).project(entriesTable.schema()).build()) { - for (GenericData.Record record : rows) { - if ((Integer) record.get("status") < 2 /* added or existing */) { - GenericData.Record file = (GenericData.Record) record.get("data_file"); - asMetadataRecord(file); - expected.add(file); - } - } - } - } - - 
Assert.assertEquals("Files table should have one row", 1, expected.size()); - Assert.assertEquals("Actual results should have one row", 1, actual.size()); - TestHelpers.assertEqualsSafe(nonDerivedSchema(filesTableDs), expected.get(0), actual.get(0)); - } - - @Test - public void testAllMetadataTablesWithStagedCommits() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "stage_aggregate_table_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - - table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit(); - spark.conf().set("spark.wap.id", "1234567"); - Dataset df1 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // add a second file - df2.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - List actualAllData = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .collectAsList(); - - List actualAllManifests = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .collectAsList(); - - List actualAllEntries = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .collectAsList(); - - Assert.assertTrue( - "Stage table should have some snapshots", table.snapshots().iterator().hasNext()); - Assert.assertEquals( - "Stage table should have null currentSnapshot", null, table.currentSnapshot()); - Assert.assertEquals("Actual results should have two rows", 2, actualAllData.size()); - Assert.assertEquals("Actual results should have two rows", 2, actualAllManifests.size()); - Assert.assertEquals("Actual results should have two rows", 2, actualAllEntries.size()); - } - - @Test - public void testAllDataFilesTable() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - Table entriesTable = loadTable(tableIdentifier, "entries"); - - Dataset df1 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // delete the first file to test that not only live files are listed - table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - - // add a second file - df2.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - // ensure table data isn't stale - table.refresh(); - - Dataset filesTableDs = - spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_data_files")); - List actual = selectNonDerived(filesTableDs).collectAsList(); - actual.sort(Comparator.comparing(o -> o.getString(1))); - - List expected = Lists.newArrayList(); - Iterable dataManifests = - Iterables.concat( - Iterables.transform(table.snapshots(), snapshot -> snapshot.dataManifests(table.io()))); - for (ManifestFile manifest : dataManifests) { - InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = - 
Avro.read(in).project(entriesTable.schema()).build()) { - for (GenericData.Record record : rows) { - if ((Integer) record.get("status") < 2 /* added or existing */) { - GenericData.Record file = (GenericData.Record) record.get("data_file"); - asMetadataRecord(file); - expected.add(file); - } - } - } - } - - expected.sort(Comparator.comparing(o -> o.get("file_path").toString())); - - Assert.assertEquals("Files table should have two rows", 2, expected.size()); - Assert.assertEquals("Actual results should have two rows", 2, actual.size()); - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(nonDerivedSchema(filesTableDs), expected.get(i), actual.get(i)); - } - } - - @Test - public void testHistoryTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "history_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Table historyTable = loadTable(tableIdentifier, "history"); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); - long firstSnapshotId = table.currentSnapshot().snapshotId(); - - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis(); - long secondSnapshotId = table.currentSnapshot().snapshotId(); - - // rollback the table state to the first snapshot - table.manageSnapshots().rollbackTo(firstSnapshotId).commit(); - long rollbackTimestamp = Iterables.getLast(table.history()).timestampMillis(); - - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - long thirdSnapshotTimestamp = table.currentSnapshot().timestampMillis(); - long thirdSnapshotId = table.currentSnapshot().snapshotId(); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "history")) - .collectAsList(); - - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); - List expected = - Lists.newArrayList( - builder - .set("made_current_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder - .set("made_current_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set( - "is_current_ancestor", - false) // commit rolled back, not an ancestor of the current table state - .build(), - builder - .set("made_current_at", rollbackTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder - .set("made_current_at", thirdSnapshotTimestamp * 1000) - .set("snapshot_id", thirdSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", true) - .build()); - - Assert.assertEquals("History table should have a row for each commit", 4, actual.size()); - TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(0), actual.get(0)); - TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(1), actual.get(1)); - 
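The snapshots behind these history rows also remain directly readable through time travel. A minimal sketch, assuming the standard option names behind SparkReadOptions.SNAPSHOT_ID and SparkReadOptions.AS_OF_TIMESTAMP ("snapshot-id" and "as-of-timestamp") and a placeholder table location:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class TimeTravelReadSketch {
      // Reads the table as of a specific snapshot id; passing "as-of-timestamp"
      // with a millisecond value instead resolves the snapshot by time.
      public static Dataset<Row> readAsOfSnapshot(
          SparkSession spark, String tableLocation, long snapshotId) {
        return spark.read()
            .format("iceberg")
            .option("snapshot-id", String.valueOf(snapshotId))
            .load(tableLocation);
      }
    }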
TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(2), actual.get(2)); - } - - @Test - public void testSnapshotsTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "snapshots_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Table snapTable = loadTable(tableIdentifier, "snapshots"); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); - long firstSnapshotId = table.currentSnapshot().snapshotId(); - String firstManifestList = table.currentSnapshot().manifestListLocation(); - - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - - long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis(); - long secondSnapshotId = table.currentSnapshot().snapshotId(); - String secondManifestList = table.currentSnapshot().manifestListLocation(); - - // rollback the table state to the first snapshot - table.manageSnapshots().rollbackTo(firstSnapshotId).commit(); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .collectAsList(); - - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); - List expected = - Lists.newArrayList( - builder - .set("committed_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("operation", "append") - .set("manifest_list", firstManifestList) - .set( - "summary", - ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1")) - .build(), - builder - .set("committed_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set("manifest_list", secondManifestList) - .set( - "summary", - ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0")) - .build()); - - Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); - TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0)); - TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(1), actual.get(1)); - } - - @Test - public void testPrunedSnapshotsTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "snapshots_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); - long firstSnapshotId = table.currentSnapshot().snapshotId(); - - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - - long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis(); - - // rollback the table state to the first snapshot - 
table.manageSnapshots().rollbackTo(firstSnapshotId).commit(); - - Dataset actualDf = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .select("operation", "committed_at", "summary", "parent_id"); - - Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); - - List actual = actualDf.collectAsList(); - - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); - List expected = - Lists.newArrayList( - builder - .set("committed_at", firstSnapshotTimestamp * 1000) - .set("parent_id", null) - .set("operation", "append") - .set( - "summary", - ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1")) - .build(), - builder - .set("committed_at", secondSnapshotTimestamp * 1000) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set( - "summary", - ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0")) - .build()); - - Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); - TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); - TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(1), actual.get(1)); - } - - @Test - public void testManifestsTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = - spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), - SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .collectAsList(); - - table.refresh(); - - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = - new GenericRecordBuilder( - AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), - "partition_summary")); - List expected = - Lists.transform( - table.currentSnapshot().allManifests(table.io()), - manifest -> - builder - .set("content", manifest.content().id()) - .set("path", manifest.path()) - .set("length", manifest.length()) - .set("partition_spec_id", manifest.partitionSpecId()) - .set("added_snapshot_id", manifest.snapshotId()) - .set( - "added_data_files_count", - manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set( - "existing_data_files_count", - manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set( - "deleted_data_files_count", - manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set( - "added_delete_files_count", - manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set( - "existing_delete_files_count", - manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set( - "deleted_delete_files_count", - manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) - .set( - "partition_summaries", - Lists.transform( - manifest.partitions(), - partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build())) - .build()); - - Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0)); - } - - @Test - public void testPruneManifestsTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = - spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), - SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - if (!spark.version().startsWith("2")) { - // Spark 2 isn't able to actually push down nested struct projections so this will not break - AssertHelpers.assertThrows( - "Can't prune struct inside list", - SparkException.class, - "Cannot project a partial list element struct", - () -> - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries.contains_null") - .collectAsList()); - } - - Dataset actualDf = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries"); - - Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries") - .collectAsList(); - - table.refresh(); - - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); - GenericRecordBuilder summaryBuilder = - new GenericRecordBuilder( - AvroSchemaUtil.convert( - projectedSchema.findType("partition_summaries.element").asStructType(), - "partition_summary")); - List expected = - Lists.transform( - table.currentSnapshot().allManifests(table.io()), - manifest -> - builder - .set("partition_spec_id", manifest.partitionSpecId()) - .set("path", manifest.path()) - .set( - "partition_summaries", - Lists.transform( - manifest.partitions(), - partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build())) - .build()); - - Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); - TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); - } - - @Test - public void testAllManifestsTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df1 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - - DataFile dataFile = - Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); - 
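The delete manifests counted in the all_manifests assertions below come from a position delete file written with the newPositionDeleteWriter and writePositionDeletes helpers defined near the end of this class. Inlined into one sequence (same test package assumed so that SparkFileWriterFactory resolves, and the generic parameter, elided in the removed text, assumed to be InternalRow), the flow is roughly:

    // Condensed sketch of the position-delete helpers used by this test.
    PartitionSpec spec = table.specs().get(dataFile.specId());
    StructLike partition = dataFile.partition();

    OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, 0, 0).build();
    EncryptedOutputFile outputFile = fileFactory.newOutputFile(spec, partition);
    PositionDeleteWriter<InternalRow> writer =
        SparkFileWriterFactory.builderFor(table).build()
            .newPositionDeleteWriter(outputFile, spec, partition);

    try (PositionDeleteWriter<InternalRow> closing = writer) {
      PositionDelete<InternalRow> delete = PositionDelete.create();
      delete.set(dataFile.path(), 0L, null); // mark row 0 of the data file as deleted
      closing.write(delete);
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }

    // toDeleteFile() is valid only after the writer has been closed.
    table.newRowDelta().addDeletes(writer.toDeleteFile()).commit();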
PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); - StructLike dataFilePartition = dataFile.partition(); - - PositionDelete delete = PositionDelete.create(); - delete.set(dataFile.path(), 0L, null); - - DeleteFile deleteFile = - writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - - table.newRowDelta().addDeletes(deleteFile).commit(); - - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - - Stream> snapshotIdToManifests = - StreamSupport.stream(table.snapshots().spliterator(), false) - .flatMap( - snapshot -> - snapshot.allManifests(table.io()).stream() - .map(manifest -> Pair.of(snapshot.snapshotId(), manifest))); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .orderBy("path") - .collectAsList(); - - table.refresh(); - - List expected = - snapshotIdToManifests - .map( - snapshotManifest -> - manifestRecord( - manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); - expected.sort(Comparator.comparing(o -> o.get("path").toString())); - - Assert.assertEquals("Manifests table should have 5 manifest rows", 5, actual.size()); - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); - } - } - - @Test - public void testUnpartitionedPartitionsTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_partitions_test"); - createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - Dataset df = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - Types.StructType expectedSchema = - Types.StructType.of( - required(2, "record_count", Types.LongType.get(), "Count of records in data files"), - required(3, "file_count", Types.IntegerType.get(), "Count of data files")); - - Table partitionsTable = loadTable(tableIdentifier, "partitions"); - - Assert.assertEquals( - "Schema should not have partition field", - expectedSchema, - partitionsTable.schema().asStruct()); - - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); - GenericData.Record expectedRow = builder.set("record_count", 1L).set("file_count", 1).build(); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .collectAsList(); - - Assert.assertEquals("Unpartitioned partitions table should have one row", 1, actual.size()); - TestHelpers.assertEqualsSafe(expectedSchema, expectedRow, actual.get(0)); - } - - @Test - public void testPartitionsTable() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Dataset df1 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - - df1.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - long firstCommitId = table.currentSnapshot().snapshotId(); - - // add a second file - df2.select("id", "data") - .write() - .format("iceberg") - 
.mode("append") - .save(loadLocation(tableIdentifier)); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); - - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); - GenericRecordBuilder partitionBuilder = - new GenericRecordBuilder( - AvroSchemaUtil.convert( - partitionsTable.schema().findType("partition").asStructType(), "partition")); - List expected = Lists.newArrayList(); - expected.add( - builder - .set("partition", partitionBuilder.set("id", 1).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); - expected.add( - builder - .set("partition", partitionBuilder.set("id", 2).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); - - Assert.assertEquals("Partitions table should have two rows", 2, expected.size()); - Assert.assertEquals("Actual results should have two rows", 2, actual.size()); - for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe( - partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); - } - - // check time travel - List actualAfterFirstCommit = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); - - Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size()); - TestHelpers.assertEqualsSafe( - partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); - - // check predicate push down - List filtered = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2") - .collectAsList(); - Assert.assertEquals("Actual results should have one row", 1, filtered.size()); - TestHelpers.assertEqualsSafe( - partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); - - List nonFiltered = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2 or record_count=1") - .collectAsList(); - Assert.assertEquals("Actual results should have one row", 2, nonFiltered.size()); - for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe( - partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); - } - } - - @Test - public synchronized void testSnapshotReadAfterAddColumn() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List originalRecords = - Lists.newArrayList( - RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); - - StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); - Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); - - Snapshot snapshot1 = table.currentSnapshot(); - - table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - - List newRecords = - Lists.newArrayList(RowFactory.create(4, "xy", "B"), 
RowFactory.create(5, "xyz", "C")); - - StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2); - Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2 - .select("id", "data", "category") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - List updatedRecords = - Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); - - Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshot1.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); - Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); - } - - @Test - public synchronized void testSnapshotReadAfterDropColumn() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); - Table table = createTable(tableIdentifier, SCHEMA2, PartitionSpec.unpartitioned()); - - List originalRecords = - Lists.newArrayList( - RowFactory.create(1, "x", "A"), - RowFactory.create(2, "y", "A"), - RowFactory.create(3, "z", "B")); - - StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA2); - Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf - .select("id", "data", "category") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); - - long tsBeforeDropColumn = waitUntilAfter(System.currentTimeMillis()); - table.updateSchema().deleteColumn("data").commit(); - long tsAfterDropColumn = waitUntilAfter(System.currentTimeMillis()); - - List newRecords = Lists.newArrayList(RowFactory.create(4, "B"), RowFactory.create(5, "C")); - - StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA3); - Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2 - .select("id", "category") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - List updatedRecords = - Lists.newArrayList( - RowFactory.create(1, "A"), - RowFactory.create(2, "A"), - RowFactory.create(3, "B"), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); - Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); - - // At tsAfterDropColumn, there has been a schema change, but no new snapshot, - // so the snapshot as of tsAfterDropColumn is the same as that as of tsBeforeDropColumn. 
- Dataset resultDf4 = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); - Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); - } - - @Test - public synchronized void testSnapshotReadAfterAddAndDropColumn() { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List originalRecords = - Lists.newArrayList( - RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); - - StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); - Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf - .select("id", "data") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); - - Snapshot snapshot1 = table.currentSnapshot(); - - table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - - List newRecords = - Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); - - StructType sparkSchemaAfterAddColumn = SparkSchemaUtil.convert(SCHEMA2); - Dataset inputDf2 = spark.createDataFrame(newRecords, sparkSchemaAfterAddColumn); - inputDf2 - .select("id", "data", "category") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(loadLocation(tableIdentifier)); - - table.refresh(); - - List updatedRecords = - Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); - - Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); - - table.updateSchema().deleteColumn("data").commit(); - - List recordsAfterDropColumn = - Lists.newArrayList( - RowFactory.create(1, null), - RowFactory.create(2, null), - RowFactory.create(3, null), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf3 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", recordsAfterDropColumn, resultDf3.orderBy("id").collectAsList()); - - Dataset resultDf4 = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshot1.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals( - "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); - Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); - } - - @Test - public void testRemoveOrphanFilesActionSupport() throws InterruptedException { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - - List records = Lists.newArrayList(new SimpleRecord(1, "1")); - - Dataset df = spark.createDataFrame(records, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - 
.save(loadLocation(tableIdentifier)); - - df.write().mode("append").parquet(table.location() + "/data"); - - // sleep for 1 second to ensure files will be old enough - Thread.sleep(1000); - - SparkActions actions = SparkActions.get(); - - DeleteOrphanFiles.Result result1 = - actions - .deleteOrphanFiles(table) - .location(table.location() + "/metadata") - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertTrue( - "Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); - - DeleteOrphanFiles.Result result2 = - actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertEquals( - "Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); - - Dataset resultDF = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - List actualRecords = - resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); - - Assert.assertEquals("Rows must match", records, actualRecords); - } - - @Test - public void testAllManifestTableSnapshotFiltering() throws Exception { - TableIdentifier tableIdentifier = TableIdentifier.of("db", "all_manifest_snapshot_filtering"); - Table table = createTable(tableIdentifier, SCHEMA, SPEC); - Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df = - spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - - List> snapshotIdToManifests = Lists.newArrayList(); - - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - Snapshot snapshot1 = table.currentSnapshot(); - snapshotIdToManifests.addAll( - snapshot1.allManifests(table.io()).stream() - .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) - .collect(Collectors.toList())); - - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - table.refresh(); - Snapshot snapshot2 = table.currentSnapshot(); - Assert.assertEquals("Should have two manifests", 2, snapshot2.allManifests(table.io()).size()); - snapshotIdToManifests.addAll( - snapshot2.allManifests(table.io()).stream() - .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) - .collect(Collectors.toList())); - - // Add manifests that will not be selected - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - df.select("id", "data") - .write() - .format("iceberg") - .mode("append") - .save(loadLocation(tableIdentifier)); - - StringJoiner snapshotIds = new StringJoiner(",", "(", ")"); - snapshotIds.add(String.valueOf(snapshot1.snapshotId())); - snapshotIds.add(String.valueOf(snapshot2.snapshotId())); - snapshotIds.toString(); - - List actual = - spark - .read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .filter("reference_snapshot_id in " + snapshotIds) - .orderBy("path") - .collectAsList(); - table.refresh(); - - List expected = - snapshotIdToManifests.stream() - .map( - snapshotManifest -> - manifestRecord( - manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); - expected.sort(Comparator.comparing(o -> o.get("path").toString())); - - Assert.assertEquals("Manifests table should have 3 manifest rows", 3, actual.size()); - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); - } - } - - private 
GenericData.Record manifestRecord( - Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { - GenericRecordBuilder builder = - new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = - new GenericRecordBuilder( - AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), - "partition_summary")); - return builder - .set("content", manifest.content().id()) - .set("path", manifest.path()) - .set("length", manifest.length()) - .set("partition_spec_id", manifest.partitionSpecId()) - .set("added_snapshot_id", manifest.snapshotId()) - .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set( - "existing_data_files_count", - manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set( - "deleted_data_files_count", - manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set( - "added_delete_files_count", - manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set( - "existing_delete_files_count", - manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set( - "deleted_delete_files_count", - manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set( - "partition_summaries", - Lists.transform( - manifest.partitions(), - partition -> - summaryBuilder - .set("contains_null", false) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build())) - .set("reference_snapshot_id", referenceSnapshotId) - .build(); - } - - public static void asMetadataRecord(GenericData.Record file) { - file.put(0, FileContent.DATA.id()); - file.put(3, 0); // specId - } - - private PositionDeleteWriter newPositionDeleteWriter( - Table table, PartitionSpec spec, StructLike partition) { - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, 0, 0).build(); - EncryptedOutputFile outputFile = fileFactory.newOutputFile(spec, partition); - - SparkFileWriterFactory fileWriterFactory = SparkFileWriterFactory.builderFor(table).build(); - return fileWriterFactory.newPositionDeleteWriter(outputFile, spec, partition); - } - - private DeleteFile writePositionDeletes( - Table table, - PartitionSpec spec, - StructLike partition, - Iterable> deletes) { - PositionDeleteWriter positionDeleteWriter = - newPositionDeleteWriter(table, spec, partition); - - try (PositionDeleteWriter writer = positionDeleteWriter) { - for (PositionDelete delete : deletes) { - writer.write(delete); - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } - - return positionDeleteWriter.toDeleteFile(); - } - - public static Dataset selectNonDerived(Dataset metadataTable) { - StructField[] fields = metadataTable.schema().fields(); - return metadataTable.select( - Stream.of(fields) - .filter(f -> !f.name().equals("readable_metrics")) // derived field - .map(f -> new Column(f.name())) - .toArray(Column[]::new)); - } - - public static Types.StructType nonDerivedSchema(Dataset metadataTable) { - return SparkSchemaUtil.convert(selectNonDerived(metadataTable).schema()).asStruct(); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java deleted file mode 100644 index 559668ee31a1..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Licensed to the 
Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.sql.Date; -import java.sql.Timestamp; -import java.util.List; -import org.apache.iceberg.spark.IcebergSpark; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.util.DateTimeUtils; -import org.apache.spark.sql.types.CharType; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.DecimalType; -import org.apache.spark.sql.types.VarcharType; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Test; - -public class TestIcebergSpark { - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestIcebergSpark.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestIcebergSpark.spark; - TestIcebergSpark.spark = null; - currentSpark.stop(); - } - - @Test - public void testRegisterIntegerBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16); - List results = spark.sql("SELECT iceberg_bucket_int_16(1)").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); - } - - @Test - public void testRegisterShortBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_short_16", DataTypes.ShortType, 16); - List results = spark.sql("SELECT iceberg_bucket_short_16(1S)").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); - } - - @Test - public void testRegisterByteBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_byte_16", DataTypes.ByteType, 16); - List results = spark.sql("SELECT iceberg_bucket_byte_16(1Y)").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); - } - - @Test - public void testRegisterLongBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_long_16", DataTypes.LongType, 16); - List results = spark.sql("SELECT iceberg_bucket_long_16(1L)").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), results.get(0).getInt(0)); - } - - @Test - public void 
testRegisterStringBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_string_16", DataTypes.StringType, 16); - List results = spark.sql("SELECT iceberg_bucket_string_16('hello')").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), - results.get(0).getInt(0)); - } - - @Test - public void testRegisterCharBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_char_16", new CharType(5), 16); - List results = spark.sql("SELECT iceberg_bucket_char_16('hello')").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), - results.get(0).getInt(0)); - } - - @Test - public void testRegisterVarCharBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_varchar_16", new VarcharType(5), 16); - List results = spark.sql("SELECT iceberg_bucket_varchar_16('hello')").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), - results.get(0).getInt(0)); - } - - @Test - public void testRegisterDateBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_date_16", DataTypes.DateType, 16); - List results = - spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) - Transforms.bucket(Types.DateType.get(), 16) - .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), - results.get(0).getInt(0)); - } - - @Test - public void testRegisterTimestampBucketUDF() { - IcebergSpark.registerBucketUDF( - spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); - List results = - spark - .sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')") - .collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) - Transforms.bucket(Types.TimestampType.withZone(), 16) - .apply( - DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), - results.get(0).getInt(0)); - } - - @Test - public void testRegisterBinaryBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_binary_16", DataTypes.BinaryType, 16); - List results = spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) - Transforms.bucket(Types.BinaryType.get(), 16) - .apply(ByteBuffer.wrap(new byte[] {0x00, 0x20, 0x00, 0x1F})), - results.get(0).getInt(0)); - } - - @Test - public void testRegisterDecimalBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_decimal_16", new DecimalType(4, 2), 16); - List results = spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); - Assert.assertEquals(1, results.size()); - Assert.assertEquals( - (int) Transforms.bucket(Types.DecimalType.of(4, 2), 16).apply(new BigDecimal("11.11")), - results.get(0).getInt(0)); - } - - @Test - public void testRegisterBooleanBucketUDF() { - Assertions.assertThatThrownBy( - () -> - IcebergSpark.registerBucketUDF( - spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot bucket by type: boolean"); - } - - @Test - public void testRegisterDoubleBucketUDF() { - Assertions.assertThatThrownBy( - () -> - IcebergSpark.registerBucketUDF( - spark, "iceberg_bucket_double_16", 
DataTypes.DoubleType, 16)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot bucket by type: double"); - } - - @Test - public void testRegisterFloatBucketUDF() { - Assertions.assertThatThrownBy( - () -> - IcebergSpark.registerBucketUDF( - spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot bucket by type: float"); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java deleted file mode 100644 index 7313c18cc09d..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.File; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkTableUtil; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.catalyst.TableIdentifier; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestIdentityPartitionData extends SparkTestBase { - private static final Configuration CONF = new Configuration(); - private static final HadoopTables TABLES = new HadoopTables(CONF); - - @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"parquet", false}, - {"parquet", true}, - {"avro", false}, - {"orc", false}, - {"orc", true}, - }; - } - - private final String format; - private final boolean vectorized; - - public TestIdentityPartitionData(String format, boolean vectorized) { - this.format = format; - this.vectorized = vectorized; - } - - private static final Schema LOG_SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", 
Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get())); - - private static final List LOGS = - ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1"), - LogMessage.info("2020-02-02", "info event 1"), - LogMessage.debug("2020-02-02", "debug event 2"), - LogMessage.info("2020-02-03", "info event 2"), - LogMessage.debug("2020-02-03", "debug event 3"), - LogMessage.info("2020-02-03", "info event 3"), - LogMessage.error("2020-02-03", "error event 1"), - LogMessage.debug("2020-02-04", "debug event 4"), - LogMessage.warn("2020-02-04", "warn event 1"), - LogMessage.debug("2020-02-04", "debug event 5")); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private PartitionSpec spec = - PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); - private Table table = null; - private Dataset logs = null; - - /** - * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in - * the underlying parquet files. This makes sure that if the identity mapping fails, the test will - * also fail. - */ - private void setupParquet() throws Exception { - File location = temp.newFolder("logs"); - File hiveLocation = temp.newFolder("hive"); - String hiveTable = "hivetable"; - Assert.assertTrue("Temp folder should exist", location.exists()); - - Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - this.logs = - spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); - spark.sql(String.format("DROP TABLE IF EXISTS %s", hiveTable)); - logs.orderBy("date", "level", "id") - .write() - .partitionBy("date", "level") - .format("parquet") - .option("path", hiveLocation.toString()) - .saveAsTable(hiveTable); - - this.table = - TABLES.create( - SparkSchemaUtil.schemaForTable(spark, hiveTable), - SparkSchemaUtil.specForTable(spark, hiveTable), - properties, - location.toString()); - - SparkTableUtil.importSparkTable( - spark, new TableIdentifier(hiveTable), table, location.toString()); - } - - @Before - public void setupTable() throws Exception { - if (format.equals("parquet")) { - setupParquet(); - } else { - File location = temp.newFolder("logs"); - Assert.assertTrue("Temp folder should exist", location.exists()); - - Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - this.table = TABLES.create(LOG_SCHEMA, spec, properties, location.toString()); - this.logs = - spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); - - logs.orderBy("date", "level", "id") - .write() - .format("iceberg") - .mode("append") - .save(location.toString()); - } - } - - @Test - public void testFullProjection() { - List expected = logs.orderBy("id").collectAsList(); - List actual = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .orderBy("id") - .select("id", "date", "level", "message") - .collectAsList(); - Assert.assertEquals("Rows should match", expected, actual); - } - - @Test - public void testProjections() { - String[][] cases = - new String[][] { - // individual fields - new String[] {"date"}, - new String[] {"level"}, - new String[] {"message"}, - // field pairs - new String[] {"date", "message"}, - new String[] {"level", "message"}, - new String[] {"date", "level"}, - // 
out-of-order pairs - new String[] {"message", "date"}, - new String[] {"message", "level"}, - new String[] {"level", "date"}, - // full projection, different orderings - new String[] {"date", "level", "message"}, - new String[] {"level", "date", "message"}, - new String[] {"date", "message", "level"}, - new String[] {"level", "message", "date"}, - new String[] {"message", "date", "level"}, - new String[] {"message", "level", "date"} - }; - - for (String[] ordering : cases) { - List expected = logs.select("id", ordering).orderBy("id").collectAsList(); - List actual = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", ordering) - .orderBy("id") - .collectAsList(); - Assert.assertEquals( - "Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java deleted file mode 100644 index 9e75145faff9..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.Iterator; -import org.apache.iceberg.RecordWrapperTest; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.RandomGenericData; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.util.StructLikeWrapper; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.Assert; -import org.junit.Ignore; - -public class TestInternalRowWrapper extends RecordWrapperTest { - - @Ignore - @Override - public void testTimestampWithoutZone() { - // Spark does not support timestamp without zone. - } - - @Ignore - @Override - public void testTime() { - // Spark does not support time fields. 
- } - - @Override - protected void generateAndValidate(Schema schema, AssertMethod assertMethod) { - int numRecords = 100; - Iterable recordList = RandomGenericData.generate(schema, numRecords, 101L); - Iterable rowList = RandomData.generateSpark(schema, numRecords, 101L); - - InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); - InternalRowWrapper rowWrapper = new InternalRowWrapper(SparkSchemaUtil.convert(schema)); - - Iterator actual = recordList.iterator(); - Iterator expected = rowList.iterator(); - - StructLikeWrapper actualWrapper = StructLikeWrapper.forType(schema.asStruct()); - StructLikeWrapper expectedWrapper = StructLikeWrapper.forType(schema.asStruct()); - for (int i = 0; i < numRecords; i++) { - Assert.assertTrue("Should have more records", actual.hasNext()); - Assert.assertTrue("Should have more InternalRow", expected.hasNext()); - - StructLike recordStructLike = recordWrapper.wrap(actual.next()); - StructLike rowStructLike = rowWrapper.wrap(expected.next()); - - assertMethod.assertEquals( - "Should have expected StructLike values", - actualWrapper.set(recordStructLike), - expectedWrapper.set(rowStructLike)); - } - - Assert.assertFalse("Shouldn't have more record", actual.hasNext()); - Assert.assertFalse("Shouldn't have more InternalRow", expected.hasNext()); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java deleted file mode 100644 index 74139b16ae99..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.List; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumWriter; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.avro.RemoveIds; -import org.apache.iceberg.hive.HiveTableBaseTest; -import org.apache.iceberg.mapping.MappingUtil; -import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.types.Types; -import org.apache.orc.OrcFile; -import org.apache.orc.TypeDescription; -import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; -import org.apache.orc.storage.ql.exec.vector.LongColumnVector; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestNameMappingProjection extends HiveTableBaseTest { - private static final Configuration CONF = HiveTableBaseTest.hiveConf; - private static SparkSession spark = null; - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @BeforeClass - public static void startSpark() { - String metastoreURI = CONF.get(HiveConf.ConfVars.METASTOREURIS.varname); - - // Create a spark session. - TestNameMappingProjection.spark = - SparkSession.builder() - .master("local[2]") - .enableHiveSupport() - .config("spark.hadoop.hive.metastore.uris", metastoreURI) - .config("hive.exec.dynamic.partition", "true") - .config("hive.exec.dynamic.partition.mode", "nonstrict") - .config("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true") - .getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestNameMappingProjection.spark; - // Stop the spark session. 
- TestNameMappingProjection.spark = null; - currentSpark.stop(); - } - - @Test - public void testOrcReaderWithNameMapping() throws IOException { - File orcFile = temp.newFolder(); - TypeDescription orcSchema = TypeDescription.createStruct(); - orcSchema.addField("id", TypeDescription.createInt()); - orcSchema.addField("name", TypeDescription.createString()); - - Path dataFilePath = new Path(orcFile.toString(), "name-mapping-data.orc"); - try (org.apache.orc.Writer writer = - OrcFile.createWriter( - dataFilePath, OrcFile.writerOptions(new Configuration()).setSchema(orcSchema))) { - VectorizedRowBatch batch = orcSchema.createRowBatch(); - byte[] aliceVal = "Alice".getBytes(StandardCharsets.UTF_8); - byte[] bobVal = "Bob".getBytes(StandardCharsets.UTF_8); - - int rowId = batch.size++; - batch.cols[0].isNull[rowId] = false; - ((LongColumnVector) batch.cols[0]).vector[rowId] = 1; - batch.cols[1].isNull[rowId] = false; - ((BytesColumnVector) batch.cols[1]).setRef(rowId, bobVal, 0, bobVal.length); - - rowId = batch.size++; - batch.cols[0].isNull[rowId] = false; - ((LongColumnVector) batch.cols[0]).vector[rowId] = 2; - batch.cols[1].isNull[rowId] = false; - ((BytesColumnVector) batch.cols[1]).setRef(rowId, aliceVal, 0, aliceVal.length); - - writer.addRowBatch(batch); - batch.reset(); - } - - File fileWithData = new File(dataFilePath.toString()); - DataFile orcDataFile = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withFormat("orc") - .withFileSizeInBytes(fileWithData.length()) - .withPath(fileWithData.getAbsolutePath()) - .withRecordCount(2) - .build(); - - assertNameMappingProjection(orcDataFile, "orc_table"); - } - - @Test - public void testAvroReaderWithNameMapping() throws IOException { - File avroFile = temp.newFile(); - org.apache.avro.Schema avroSchema = - SchemaBuilder.record("TestRecord") - .namespace("org.apache.iceberg.spark.data") - .fields() - .requiredInt("id") - .requiredString("name") - .endRecord(); - - org.apache.avro.Schema avroSchemaWithoutIds = RemoveIds.removeIds(avroSchema); - - GenericRecord record1 = new GenericData.Record(avroSchemaWithoutIds); - record1.put("id", 1); - record1.put("name", "Bob"); - - GenericRecord record2 = new GenericData.Record(avroSchemaWithoutIds); - record2.put("id", 2); - record2.put("name", "Alice"); - - DatumWriter datumWriter = new GenericDatumWriter<>(avroSchemaWithoutIds); - DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter); - - dataFileWriter.create(avroSchemaWithoutIds, avroFile); - dataFileWriter.append(record1); - dataFileWriter.append(record2); - dataFileWriter.close(); - - DataFile avroDataFile = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withFormat("avro") - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.getAbsolutePath()) - .withRecordCount(2) - .build(); - - assertNameMappingProjection(avroDataFile, "avro_table"); - } - - private void assertNameMappingProjection(DataFile dataFile, String tableName) { - Schema filteredSchema = new Schema(required(1, "name", Types.StringType.get())); - NameMapping nameMapping = MappingUtil.create(filteredSchema); - - Schema tableSchema = - new Schema( - required(1, "name", Types.StringType.get()), - optional(2, "id", Types.IntegerType.get())); - - Table table = - catalog.createTable( - org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, tableName), - tableSchema, - PartitionSpec.unpartitioned()); - - table - .updateProperties() - .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) - .commit(); - - 
table.newFastAppend().appendFile(dataFile).commit(); - - List actual = - spark - .read() - .format("iceberg") - .load(String.format("%s.%s", DB_NAME, tableName)) - .filter("name='Alice'") - .collectAsList(); - - Assert.assertEquals("Should project 1 record", 1, actual.size()); - Assert.assertEquals("Should equal to 'Alice'", "Alice", actual.get(0).getString(0)); - Assert.assertNull("should be null", actual.get(0).get(1)); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java deleted file mode 100644 index f585ed360f95..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.Files.localOutput; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.UUID; -import org.apache.avro.generic.GenericData; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.spark.data.AvroDataTest; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.TestHelpers; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestParquetScan extends AvroDataTest { - private static final Configuration CONF = new Configuration(); - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestParquetScan.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestParquetScan.spark; - TestParquetScan.spark = null; - currentSpark.stop(); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Parameterized.Parameters(name = "vectorized = {0}") - public static 
Object[] parameters() { - return new Object[] {false, true}; - } - - private final boolean vectorized; - - public TestParquetScan(boolean vectorized) { - this.vectorized = vectorized; - } - - @Override - protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue( - "Cannot handle non-string map keys in parquet-avro", - null - == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); - - File parent = temp.newFolder("parquet"); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - dataFolder.mkdirs(); - - File parquetFile = - new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); - - HadoopTables tables = new HadoopTables(CONF); - Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); - - // Important: use the table's schema for the rest of the test - // When tables are created, the column ids are reassigned. - Schema tableSchema = table.schema(); - - List expected = RandomData.generateList(tableSchema, 100, 1L); - - try (FileAppender writer = - Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { - writer.addAll(expected); - } - - DataFile file = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(100) - .build(); - - table.newAppend().appendFile(file).commit(); - table - .updateProperties() - .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .commit(); - - Dataset df = spark.read().format("iceberg").load(location.toString()); - - List rows = df.collectAsList(); - Assert.assertEquals("Should contain 100 rows", 100, rows.size()); - - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i)); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java deleted file mode 100644 index ffe21432f00c..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import java.io.File; -import java.io.IOException; -import java.net.URI; -import java.sql.Timestamp; -import java.time.Instant; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import java.util.function.Predicate; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RawLocalFileSystem; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.expressions.Literal; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.transforms.Transform; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Types; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.unsafe.types.UTF8String; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestPartitionPruning { - - private static final Configuration CONF = new Configuration(); - private static final HadoopTables TABLES = new HadoopTables(CONF); - - @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"parquet", false}, - {"parquet", true}, - {"avro", false}, - {"orc", false}, - {"orc", true} - }; - } - - private final String format; - private final boolean vectorized; - - public TestPartitionPruning(String format, boolean vectorized) { - this.format = format; - this.vectorized = vectorized; - } - - private static SparkSession spark = null; - private static JavaSparkContext sparkContext = null; - - private static Transform bucketTransform = - Transforms.bucket(Types.IntegerType.get(), 3); - private static Transform truncateTransform = - Transforms.truncate(Types.StringType.get(), 5); - private static Transform hourTransform = - Transforms.hour(Types.TimestampType.withoutZone()); - - @BeforeClass - public static void startSpark() { - TestPartitionPruning.spark = SparkSession.builder().master("local[2]").getOrCreate(); - TestPartitionPruning.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - String optionKey = String.format("fs.%s.impl", CountOpenLocalFileSystem.scheme); - CONF.set(optionKey, CountOpenLocalFileSystem.class.getName()); - spark.conf().set(optionKey, CountOpenLocalFileSystem.class.getName()); - 
spark.conf().set("spark.sql.session.timeZone", "UTC"); - spark - .udf() - .register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); - spark - .udf() - .register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); - // NOTE: date transforms take the type long, not Timestamp - spark - .udf() - .register( - "hour", - (Timestamp ts) -> - hourTransform.apply( - org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), - DataTypes.IntegerType); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestPartitionPruning.spark; - TestPartitionPruning.spark = null; - currentSpark.stop(); - } - - private static final Schema LOG_SCHEMA = - new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()), - Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone())); - - private static final List LOGS = - ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), - LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), - LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), - LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), - LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), - LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), - LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), - LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), - LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), - LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00"))); - - private static Instant getInstant(String timestampWithoutZone) { - Long epochMicros = - (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); - return Instant.ofEpochMilli(TimeUnit.MICROSECONDS.toMillis(epochMicros)); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private PartitionSpec spec = - PartitionSpec.builderFor(LOG_SCHEMA) - .identity("date") - .identity("level") - .bucket("id", 3) - .truncate("message", 5) - .hour("timestamp") - .build(); - - @Test - public void testPartitionPruningIdentityString() { - String filterCond = "date >= '2020-02-03' AND level = 'DEBUG'"; - Predicate partCondition = - (Row r) -> { - String date = r.getString(0); - String level = r.getString(1); - return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); - }; - - runTest(filterCond, partCondition); - } - - @Test - public void testPartitionPruningBucketingInteger() { - final int[] ids = new int[] {LOGS.get(3).getId(), LOGS.get(7).getId()}; - String condForIds = - Arrays.stream(ids).mapToObj(String::valueOf).collect(Collectors.joining(",", "(", ")")); - String filterCond = "id in " + condForIds; - Predicate partCondition = - (Row r) -> { - int bucketId = r.getInt(2); - Set buckets = - Arrays.stream(ids).map(bucketTransform::apply).boxed().collect(Collectors.toSet()); - return buckets.contains(bucketId); - }; - - runTest(filterCond, partCondition); - } - - @Test - public void testPartitionPruningTruncatedString() { - String filterCond = "message like 'info event%'"; - 
Predicate partCondition = - (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.equals("info "); - }; - - runTest(filterCond, partCondition); - } - - @Test - public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitionValue() { - String filterCond = "message like 'inf%'"; - Predicate partCondition = - (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.startsWith("inf"); - }; - - runTest(filterCond, partCondition); - } - - @Test - public void testPartitionPruningHourlyPartition() { - String filterCond; - if (spark.version().startsWith("2")) { - // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the - // filter. - filterCond = "timestamp >= to_timestamp('2020-02-03T01:00:00')"; - } else { - filterCond = "timestamp >= '2020-02-03T01:00:00'"; - } - Predicate partCondition = - (Row r) -> { - int hourValue = r.getInt(4); - Instant instant = getInstant("2020-02-03T01:00:00"); - Integer hourValueToFilter = - hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); - return hourValue >= hourValueToFilter; - }; - - runTest(filterCond, partCondition); - } - - private void runTest(String filterCond, Predicate partCondition) { - File originTableLocation = createTempDir(); - Assert.assertTrue("Temp folder should exist", originTableLocation.exists()); - - Table table = createTable(originTableLocation); - Dataset logs = createTestDataset(); - saveTestDatasetToTable(logs, table); - - List expected = - logs.select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); - Assert.assertFalse("Expected rows should be not empty", expected.isEmpty()); - - // remove records which may be recorded during storing to table - CountOpenLocalFileSystem.resetRecordsInPathPrefix(originTableLocation.getAbsolutePath()); - - List actual = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); - Assert.assertFalse("Actual rows should not be empty", actual.isEmpty()); - - Assert.assertEquals("Rows should match", expected, actual); - - assertAccessOnDataFiles(originTableLocation, table, partCondition); - } - - private File createTempDir() { - try { - return temp.newFolder(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - private Table createTable(File originTableLocation) { - String trackedTableLocation = CountOpenLocalFileSystem.convertPath(originTableLocation); - Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - return TABLES.create(LOG_SCHEMA, spec, properties, trackedTableLocation); - } - - private Dataset createTestDataset() { - List rows = - LOGS.stream() - .map( - logMessage -> { - Object[] underlying = - new Object[] { - logMessage.getId(), - UTF8String.fromString(logMessage.getDate()), - UTF8String.fromString(logMessage.getLevel()), - UTF8String.fromString(logMessage.getMessage()), - // discard the nanoseconds part to simplify - TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) - }; - return new GenericInternalRow(underlying); - }) - .collect(Collectors.toList()); - - JavaRDD rdd = sparkContext.parallelize(rows); - Dataset df = - spark.internalCreateDataFrame( - JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); - - return df.selectExpr("id", 
"date", "level", "message", "timestamp") - .selectExpr( - "id", - "date", - "level", - "message", - "timestamp", - "bucket3(id) AS bucket_id", - "truncate5(message) AS truncated_message", - "hour(timestamp) AS ts_hour"); - } - - private void saveTestDatasetToTable(Dataset logs, Table table) { - logs.orderBy("date", "level", "bucket_id", "truncated_message", "ts_hour") - .select("id", "date", "level", "message", "timestamp") - .write() - .format("iceberg") - .mode("append") - .save(table.location()); - } - - private void assertAccessOnDataFiles( - File originTableLocation, Table table, Predicate partCondition) { - // only use files in current table location to avoid side-effects on concurrent test runs - Set readFilesInQuery = - CountOpenLocalFileSystem.pathToNumOpenCalled.keySet().stream() - .filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) - .collect(Collectors.toSet()); - - List files = - spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); - - Set filesToRead = extractFilePathsMatchingConditionOnPartition(files, partCondition); - Set filesToNotRead = extractFilePathsNotIn(files, filesToRead); - - // Just to be sure, they should be mutually exclusive. - Assert.assertTrue(Sets.intersection(filesToRead, filesToNotRead).isEmpty()); - - Assert.assertFalse("The query should prune some data files.", filesToNotRead.isEmpty()); - - // We don't check "all" data files bound to the condition are being read, as data files can be - // pruned on - // other conditions like lower/upper bound of columns. - Assert.assertFalse( - "Some of data files in partition range should be read. " - + "Read files in query: " - + readFilesInQuery - + " / data files in partition range: " - + filesToRead, - Sets.intersection(filesToRead, readFilesInQuery).isEmpty()); - - // Data files which aren't bound to the condition shouldn't be read. - Assert.assertTrue( - "Data files outside of partition range should not be read. 
" - + "Read files in query: " - + readFilesInQuery - + " / data files outside of partition range: " - + filesToNotRead, - Sets.intersection(filesToNotRead, readFilesInQuery).isEmpty()); - } - - private Set extractFilePathsMatchingConditionOnPartition( - List files, Predicate condition) { - // idx 1: file_path, idx 3: partition - return files.stream() - .filter( - r -> { - Row partition = r.getStruct(4); - return condition.test(partition); - }) - .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) - .collect(Collectors.toSet()); - } - - private Set extractFilePathsNotIn(List files, Set filePaths) { - Set allFilePaths = - files.stream() - .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) - .collect(Collectors.toSet()); - return Sets.newHashSet(Sets.symmetricDifference(allFilePaths, filePaths)); - } - - public static class CountOpenLocalFileSystem extends RawLocalFileSystem { - public static String scheme = - String.format("TestIdentityPartitionData%dfs", new Random().nextInt()); - public static Map pathToNumOpenCalled = Maps.newConcurrentMap(); - - public static String convertPath(String absPath) { - return scheme + "://" + absPath; - } - - public static String convertPath(File file) { - return convertPath(file.getAbsolutePath()); - } - - public static String stripScheme(String pathWithScheme) { - if (!pathWithScheme.startsWith(scheme + ":")) { - throw new IllegalArgumentException("Received unexpected path: " + pathWithScheme); - } - - int idxToCut = scheme.length() + 1; - while (pathWithScheme.charAt(idxToCut) == '/') { - idxToCut++; - } - - // leave the last '/' - idxToCut--; - - return pathWithScheme.substring(idxToCut); - } - - public static void resetRecordsInPathPrefix(String pathPrefix) { - pathToNumOpenCalled.keySet().stream() - .filter(p -> p.startsWith(pathPrefix)) - .forEach(key -> pathToNumOpenCalled.remove(key)); - } - - @Override - public URI getUri() { - return URI.create(scheme + ":///"); - } - - @Override - public String getScheme() { - return scheme; - } - - @Override - public FSDataInputStream open(Path f, int bufferSize) throws IOException { - String path = f.toUri().getPath(); - pathToNumOpenCalled.compute( - path, - (ignored, v) -> { - if (v == null) { - return 1L; - } else { - return v + 1; - } - }); - return super.open(f, bufferSize); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java deleted file mode 100644 index 73f10b1a8e26..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ /dev/null @@ -1,493 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.avro.generic.GenericData; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.Files; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkWriteOptions; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.spark.data.TestHelpers; -import org.apache.iceberg.types.Types; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestPartitionValues { - @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"parquet", false}, - {"parquet", true}, - {"avro", false}, - {"orc", false}, - {"orc", true} - }; - } - - private static final Schema SUPPORTED_PRIMITIVES = - new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - required(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - required(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - required(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - private static final Schema SIMPLE_SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SIMPLE_SCHEMA).identity("data").build(); - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestPartitionValues.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestPartitionValues.spark; - TestPartitionValues.spark = null; - currentSpark.stop(); - } - - @Rule public TemporaryFolder temp = 
new TemporaryFolder(); - - private final String format; - private final boolean vectorized; - - public TestPartitionValues(String format, boolean vectorized) { - this.format = format; - this.vectorized = vectorized; - } - - @Test - public void testNullPartitionValue() throws Exception { - String desc = "null_part"; - File parent = temp.newFolder(desc); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); - table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, null)); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(location.toString()); - - Dataset result = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testReorderedColumns() throws Exception { - String desc = "reorder_columns"; - File parent = temp.newFolder(desc); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); - table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("data", "id") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .save(location.toString()); - - Dataset result = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testReorderedColumnsNoNullability() throws Exception { - String desc = "reorder_columns_no_nullability"; - File parent = temp.newFolder(desc); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); - table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = 
spark.createDataFrame(expected, SimpleRecord.class); - - df.select("data", "id") - .write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .option(SparkWriteOptions.CHECK_NULLABILITY, "false") - .save(location.toString()); - - Dataset result = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testPartitionValueTypes() throws Exception { - String[] columnNames = - new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" - }; - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - - // create a table around the source data - String sourceLocation = temp.newFolder("source_table").toString(); - Table source = tables.create(SUPPORTED_PRIMITIVES, sourceLocation); - - // write out an Avro data file with all of the data types for source data - List expected = RandomData.generateList(source.schema(), 2, 128735L); - File avroData = temp.newFile("data.avro"); - Assert.assertTrue(avroData.delete()); - try (FileAppender appender = - Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { - appender.addAll(expected); - } - - // add the Avro data file to the source table - source - .newAppend() - .appendFile( - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) - .commit(); - - Dataset sourceDF = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); - - for (String column : columnNames) { - String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); - - File parent = temp.newFolder(desc); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - - PartitionSpec spec = PartitionSpec.builderFor(SUPPORTED_PRIMITIVES).identity(column).build(); - - Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString()); - table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - - sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - - List actual = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); - - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - SUPPORTED_PRIMITIVES.asStruct(), expected.get(i), actual.get(i)); - } - } - } - - @Test - public void testNestedPartitionValues() throws Exception { - String[] columnNames = - new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" - }; - - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct())); - - // create a table around the source data - String sourceLocation = 
temp.newFolder("source_table").toString(); - Table source = tables.create(nestedSchema, sourceLocation); - - // write out an Avro data file with all of the data types for source data - List expected = RandomData.generateList(source.schema(), 2, 128735L); - File avroData = temp.newFile("data.avro"); - Assert.assertTrue(avroData.delete()); - try (FileAppender appender = - Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { - appender.addAll(expected); - } - - // add the Avro data file to the source table - source - .newAppend() - .appendFile( - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) - .commit(); - - Dataset sourceDF = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); - - for (String column : columnNames) { - String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); - - File parent = temp.newFolder(desc); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - - PartitionSpec spec = - PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); - - Table table = tables.create(nestedSchema, spec, location.toString()); - table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - - sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - - List actual = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); - - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - - for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(nestedSchema.asStruct(), expected.get(i), actual.get(i)); - } - } - } - - /** - * To verify if WrappedPositionAccessor is generated against a string field within a nested field, - * rather than a Position2Accessor. 
Or when building the partition path, a ClassCastException is - * thrown with the message like: Cannot cast org.apache.spark.unsafe.types.UTF8String to - * java.lang.CharSequence - */ - @Test - public void testPartitionedByNestedString() throws Exception { - // schema and partition spec - Schema nestedSchema = - new Schema( - Types.NestedField.required( - 1, - "struct", - Types.StructType.of( - Types.NestedField.required(2, "string", Types.StringType.get())))); - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("struct.string").build(); - - // create table - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - String baseLocation = temp.newFolder("partition_by_nested_string").toString(); - tables.create(nestedSchema, spec, baseLocation); - - // input data frame - StructField[] structFields = { - new StructField( - "struct", - DataTypes.createStructType( - new StructField[] { - new StructField("string", DataTypes.StringType, false, Metadata.empty()) - }), - false, - Metadata.empty()) - }; - - List rows = Lists.newArrayList(); - rows.add(RowFactory.create(RowFactory.create("nested_string_value"))); - Dataset sourceDF = spark.createDataFrame(rows, new StructType(structFields)); - - // write into iceberg - sourceDF.write().format("iceberg").mode(SaveMode.Append).save(baseLocation); - - // verify - List actual = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(baseLocation) - .collectAsList(); - - Assert.assertEquals("Number of rows should match", rows.size(), actual.size()); - } - - @Test - public void testReadPartitionColumn() throws Exception { - Assume.assumeTrue("Temporary skip ORC", !"orc".equals(format)); - - Schema nestedSchema = - new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional( - 2, - "struct", - Types.StructType.of( - Types.NestedField.optional(3, "innerId", Types.LongType.get()), - Types.NestedField.optional(4, "innerName", Types.StringType.get())))); - PartitionSpec spec = - PartitionSpec.builderFor(nestedSchema).identity("struct.innerName").build(); - - // create table - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - String baseLocation = temp.newFolder("partition_by_nested_string").toString(); - Table table = tables.create(nestedSchema, spec, baseLocation); - table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - - // write into iceberg - MapFunction func = - value -> new ComplexRecord(value, new NestedRecord(value, "name_" + value)); - spark - .range(0, 10, 1, 1) - .map(func, Encoders.bean(ComplexRecord.class)) - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(baseLocation); - - List actual = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(baseLocation) - .select("struct.innerName") - .as(Encoders.STRING()) - .collectAsList(); - - Assert.assertEquals("Number of rows should match", 10, actual.size()); - - List inputRecords = - IntStream.range(0, 10).mapToObj(i -> "name_" + i).collect(Collectors.toList()); - Assert.assertEquals("Read object should be matched", inputRecords, actual); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java deleted file mode 100644 index eecc405b1a09..000000000000 --- 
a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java +++ /dev/null @@ -1,609 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.avro.Schema.Type.UNION; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public abstract class TestReadProjection { - final String format; - - TestReadProjection(String format) { - this.format = format; - } - - protected abstract Record writeAndRead( - String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Test - public void testFullProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - record.setField("id", 34L); - record.setField("data", "test"); - - Record projected = writeAndRead("full_projection", schema, schema, record); - - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - - int cmp = - Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); - Assert.assertEquals("Should contain the correct data value", 0, cmp); - } - - @Test - public void testReorderedFullProjection() throws Exception { - // Assume.assumeTrue( - // "Spark's Parquet read support does not support reordered columns", - // !format.equalsIgnoreCase("parquet")); - - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - record.setField("id", 34L); - record.setField("data", "test"); - - Schema reordered = - new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get())); - - Record projected 
= writeAndRead("reordered_full_projection", schema, reordered, record); - - Assert.assertEquals("Should contain the correct 0 value", "test", projected.get(0).toString()); - Assert.assertEquals("Should contain the correct 1 value", 34L, projected.get(1)); - } - - @Test - public void testReorderedProjection() throws Exception { - // Assume.assumeTrue( - // "Spark's Parquet read support does not support reordered columns", - // !format.equalsIgnoreCase("parquet")); - - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - record.setField("id", 34L); - record.setField("data", "test"); - - Schema reordered = - new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get())); - - Record projected = writeAndRead("reordered_projection", schema, reordered, record); - - Assert.assertNull("Should contain the correct 0 value", projected.get(0)); - Assert.assertEquals("Should contain the correct 1 value", "test", projected.get(1).toString()); - Assert.assertNull("Should contain the correct 2 value", projected.get(2)); - } - - @Test - public void testEmptyProjection() throws Exception { - Schema schema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Record record = GenericRecord.create(schema); - record.setField("id", 34L); - record.setField("data", "test"); - - Record projected = writeAndRead("empty_projection", schema, schema.select(), record); - - Assert.assertNotNull("Should read a non-null record", projected); - // this is expected because there are no values - Assertions.assertThatThrownBy(() -> projected.get(0)) - .isInstanceOf(ArrayIndexOutOfBoundsException.class); - } - - @Test - public void testBasicProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Record record = GenericRecord.create(writeSchema); - record.setField("id", 34L); - record.setField("data", "test"); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); - Assert.assertNull("Should not project data", projected.getField("data")); - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - - Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); - - projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); - - Assert.assertNull("Should not project id", projected.getField("id")); - int cmp = - Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); - Assert.assertEquals("Should contain the correct data value", 0, cmp); - } - - @Test - public void testRename() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); - - Record record = GenericRecord.create(writeSchema); - record.setField("id", 34L); - record.setField("data", "test"); - - Schema readSchema = - new Schema( - Types.NestedField.required(0, "id", 
Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get())); - - Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); - - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = - Comparators.charSequences().compare("test", (CharSequence) projected.getField("renamed")); - Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); - } - - @Test - public void testNestedStructProjection() throws Exception { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 3, - "location", - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get())))); - - Record record = GenericRecord.create(writeSchema); - record.setField("id", 34L); - Record location = GenericRecord.create(writeSchema.findType("location").asStructType()); - location.setField("lat", 52.995143f); - location.setField("long", -1.539054f); - record.setField("location", location); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Record projectedLocation = (Record) projected.getField("location"); - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - Assert.assertNull("Should not project location", projectedLocation); - - Schema latOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); - - projected = writeAndRead("latitude_only", writeSchema, latOnly, record); - projectedLocation = (Record) projected.getField("location"); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertNull("Should not project longitude", projectedLocation.getField("long")); - Assert.assertEquals( - "Should project latitude", - 52.995143f, - (float) projectedLocation.getField("lat"), - 0.000001f); - - Schema longOnly = - new Schema( - Types.NestedField.optional( - 3, - "location", - Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); - - projected = writeAndRead("longitude_only", writeSchema, longOnly, record); - projectedLocation = (Record) projected.getField("location"); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertNull("Should not project latitutde", projectedLocation.getField("lat")); - Assert.assertEquals( - "Should project longitude", - -1.539054f, - (float) projectedLocation.getField("long"), - 0.000001f); - - Schema locationOnly = writeSchema.select("location"); - projected = writeAndRead("location_only", writeSchema, locationOnly, record); - projectedLocation = (Record) projected.getField("location"); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertEquals( - "Should project latitude", - 52.995143f, - (float) projectedLocation.getField("lat"), - 0.000001f); - Assert.assertEquals( - "Should project longitude", - -1.539054f, - (float) projectedLocation.getField("long"), - 0.000001f); - } - - @Test - public 
void testMapProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); - - Map properties = ImmutableMap.of("a", "A", "b", "B"); - - Record record = GenericRecord.create(writeSchema); - record.setField("id", 34L); - record.setField("properties", properties); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - Assert.assertNull("Should not project properties map", projected.getField("properties")); - - Schema keyOnly = writeSchema.select("properties.key"); - projected = writeAndRead("key_only", writeSchema, keyOnly, record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals( - "Should project entire map", - properties, - toStringMap((Map) projected.getField("properties"))); - - Schema valueOnly = writeSchema.select("properties.value"); - projected = writeAndRead("value_only", writeSchema, valueOnly, record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals( - "Should project entire map", - properties, - toStringMap((Map) projected.getField("properties"))); - - Schema mapOnly = writeSchema.select("properties"); - projected = writeAndRead("map_only", writeSchema, mapOnly, record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals( - "Should project entire map", - properties, - toStringMap((Map) projected.getField("properties"))); - } - - private Map toStringMap(Map map) { - Map stringMap = Maps.newHashMap(); - for (Map.Entry entry : map.entrySet()) { - if (entry.getValue() instanceof CharSequence) { - stringMap.put(entry.getKey().toString(), entry.getValue().toString()); - } else { - stringMap.put(entry.getKey().toString(), entry.getValue()); - } - } - return stringMap; - } - - @Test - public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()))))); - - Record record = GenericRecord.create(writeSchema); - record.setField("id", 34L); - Record l1 = GenericRecord.create(writeSchema.findType("locations.value").asStructType()); - l1.setField("lat", 53.992811f); - l1.setField("long", -1.542616f); - Record l2 = GenericRecord.create(l1.struct()); - l2.setField("lat", 52.995143f); - l2.setField("long", -1.539054f); - record.setField("locations", ImmutableMap.of("L1", l1, "L2", l2)); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - Assert.assertNull("Should not project locations map", projected.getField("locations")); - - projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); - Assert.assertNull("Should not project id", 
projected.getField("id")); - Assert.assertEquals( - "Should project locations map", - record.getField("locations"), - toStringMap((Map) projected.getField("locations"))); - - projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); - Assert.assertNull("Should not project id", projected.getField("id")); - Map locations = toStringMap((Map) projected.getField("locations")); - Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals( - "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); - Record projectedL1 = (Record) locations.get("L1"); - Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals( - "L1 should contain lat", 53.992811f, (float) projectedL1.getField("lat"), 0.000001); - Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); - Record projectedL2 = (Record) locations.get("L2"); - Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals( - "L2 should contain lat", 52.995143f, (float) projectedL2.getField("lat"), 0.000001); - Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - - projected = - writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); - Assert.assertNull("Should not project id", projected.getField("id")); - locations = toStringMap((Map) projected.getField("locations")); - Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals( - "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); - projectedL1 = (Record) locations.get("L1"); - Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertEquals( - "L1 should contain long", -1.542616f, (float) projectedL1.getField("long"), 0.000001); - projectedL2 = (Record) locations.get("L2"); - Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); - Assert.assertEquals( - "L2 should contain long", -1.539054f, (float) projectedL2.getField("long"), 0.000001); - - Schema latitiudeRenamed = - new Schema( - Types.NestedField.optional( - 5, - "locations", - Types.MapType.ofOptional( - 6, - 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); - - projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); - Assert.assertNull("Should not project id", projected.getField("id")); - locations = toStringMap((Map) projected.getField("locations")); - Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals( - "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); - projectedL1 = (Record) locations.get("L1"); - Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals( - "L1 should contain latitude", - 53.992811f, - (float) projectedL1.getField("latitude"), - 0.000001); - Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); - projectedL2 = (Record) locations.get("L2"); - Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals( - "L2 should contain latitude", - 52.995143f, - (float) projectedL2.getField("latitude"), - 0.000001); - Assert.assertNull("L2 should not contain lat", 
projectedL2.getField("lat")); - Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - } - - @Test - public void testListProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); - - List values = ImmutableList.of(56L, 57L, 58L); - - Record record = GenericRecord.create(writeSchema); - record.setField("id", 34L); - record.setField("values", values); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - Assert.assertNull("Should not project values list", projected.getField("values")); - - Schema elementOnly = writeSchema.select("values.element"); - projected = writeAndRead("element_only", writeSchema, elementOnly, record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire list", values, projected.getField("values")); - - Schema listOnly = writeSchema.select("values"); - projected = writeAndRead("list_only", writeSchema, listOnly, record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire list", values, projected.getField("values")); - } - - @Test - @SuppressWarnings("unchecked") - public void testListOfStructsProjection() throws IOException { - Schema writeSchema = - new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); - - Record record = GenericRecord.create(writeSchema); - record.setField("id", 34L); - Record p1 = GenericRecord.create(writeSchema.findType("points.element").asStructType()); - p1.setField("x", 1); - p1.setField("y", 2); - Record p2 = GenericRecord.create(p1.struct()); - p2.setField("x", 3); - p2.setField("y", null); - record.setField("points", ImmutableList.of(p1, p2)); - - Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals( - "Should contain the correct id value", 34L, (long) projected.getField("id")); - Assert.assertNull("Should not project points list", projected.getField("points")); - - projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals( - "Should project points list", record.getField("points"), projected.getField("points")); - - projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertNotNull("Should project points list", projected.getField("points")); - List points = (List) projected.getField("points"); - Assert.assertEquals("Should read 2 points", 2, points.size()); - Record projectedP1 = points.get(0); - Assert.assertEquals("Should project x", 1, (int) projectedP1.getField("x")); - Assert.assertNull("Should not project y", projectedP1.getField("y")); - Record projectedP2 = points.get(1); - 
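// Editor's note (not part of the patch): a minimal, self-contained sketch of the
// projection-by-name pattern these removed Spark 2.4 tests exercised. It uses only the
// core Iceberg Schema API (no Spark), so it stays valid after this module is deleted;
// the field names and ids below simply mirror the removed test fixtures.
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class ProjectionSketch {
  public static void main(String[] args) {
    Schema writeSchema =
        new Schema(
            Types.NestedField.required(0, "id", Types.LongType.get()),
            Types.NestedField.optional(
                22,
                "points",
                Types.ListType.ofOptional(
                    21,
                    Types.StructType.of(
                        Types.NestedField.required(19, "x", Types.IntegerType.get()),
                        Types.NestedField.optional(18, "y", Types.IntegerType.get())))));

    // Selecting a nested field by its dotted name keeps the enclosing list/struct but drops
    // sibling fields; readers return null for anything that was not selected, which is the
    // behaviour the removed projection tests assert.
    Schema xOnly = writeSchema.select("points.x");
    System.out.println(xOnly);
  }
}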
Assert.assertEquals("Should project x", 3, (int) projectedP2.getField("x")); - Assert.assertNull("Should not project y", projectedP2.getField("y")); - - projected = writeAndRead("y_only", writeSchema, writeSchema.select("points.y"), record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertNotNull("Should project points list", projected.getField("points")); - points = (List) projected.getField("points"); - Assert.assertEquals("Should read 2 points", 2, points.size()); - projectedP1 = points.get(0); - Assert.assertNull("Should not project x", projectedP1.getField("x")); - Assert.assertEquals("Should project y", 2, (int) projectedP1.getField("y")); - projectedP2 = points.get(1); - Assert.assertNull("Should not project x", projectedP2.getField("x")); - Assert.assertNull("Should project null y", projectedP2.getField("y")); - - Schema yRenamed = - new Schema( - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); - - projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertNotNull("Should project points list", projected.getField("points")); - points = (List) projected.getField("points"); - Assert.assertEquals("Should read 2 points", 2, points.size()); - projectedP1 = points.get(0); - Assert.assertNull("Should not project x", projectedP1.getField("x")); - Assert.assertNull("Should not project y", projectedP1.getField("y")); - Assert.assertEquals("Should project z", 2, (int) projectedP1.getField("z")); - projectedP2 = points.get(1); - Assert.assertNull("Should not project x", projectedP2.getField("x")); - Assert.assertNull("Should not project y", projectedP2.getField("y")); - Assert.assertNull("Should project null z", projectedP2.getField("z")); - - Schema zAdded = - new Schema( - Types.NestedField.optional( - 22, - "points", - Types.ListType.ofOptional( - 21, - Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()), - Types.NestedField.optional(20, "z", Types.IntegerType.get()))))); - - projected = writeAndRead("z_added", writeSchema, zAdded, record); - Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertNotNull("Should project points list", projected.getField("points")); - points = (List) projected.getField("points"); - Assert.assertEquals("Should read 2 points", 2, points.size()); - projectedP1 = points.get(0); - Assert.assertEquals("Should project x", 1, (int) projectedP1.getField("x")); - Assert.assertEquals("Should project y", 2, (int) projectedP1.getField("y")); - Assert.assertNull("Should contain null z", projectedP1.getField("z")); - projectedP2 = points.get(1); - Assert.assertEquals("Should project x", 3, (int) projectedP2.getField("x")); - Assert.assertNull("Should project null y", projectedP2.getField("y")); - Assert.assertNull("Should contain null z", projectedP2.getField("z")); - } - - private static org.apache.avro.Schema fromOption(org.apache.avro.Schema schema) { - Preconditions.checkArgument( - schema.getType() == UNION, "Expected union schema but was passed: %s", schema); - Preconditions.checkArgument( - schema.getTypes().size() == 2, "Expected optional schema, but was passed: %s", schema); - if (schema.getTypes().get(0).getType() == org.apache.avro.Schema.Type.NULL) { - return 
schema.getTypes().get(1); - } else { - return schema.getTypes().get(0); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java deleted file mode 100644 index 2ecab364cdfe..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.Serializable; -import java.util.List; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.events.Listeners; -import org.apache.iceberg.events.ScanEvent; -import org.apache.iceberg.expressions.And; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSelect { - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "doubleVal", Types.DoubleType.get())); - - private static SparkSession spark; - - private static int scanEventCount = 0; - private static ScanEvent lastScanEvent = null; - - private Table table; - - static { - Listeners.register( - event -> { - scanEventCount += 1; - lastScanEvent = event; - }, - ScanEvent.class); - } - - @BeforeClass - public static void startSpark() { - spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = spark; - spark = null; - currentSpark.stop(); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private String tableLocation = null; - - @Before - public void init() throws Exception { - File tableDir = temp.newFolder(); - this.tableLocation = 
tableDir.toURI().toString(); - - table = TABLES.create(SCHEMA, tableLocation); - - List rows = - Lists.newArrayList( - new Record(1, "a", 1.0), new Record(2, "b", 2.0), new Record(3, "c", Double.NaN)); - - Dataset df = spark.createDataFrame(rows, Record.class); - - df.select("id", "data", "doubleVal") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); - - table.refresh(); - - Dataset results = spark.read().format("iceberg").load(tableLocation); - results.createOrReplaceTempView("table"); - - scanEventCount = 0; - lastScanEvent = null; - } - - @Test - public void testSelect() { - List expected = - ImmutableList.of( - new Record(1, "a", 1.0), new Record(2, "b", 2.0), new Record(3, "c", Double.NaN)); - - Assert.assertEquals( - "Should return all expected rows", - expected, - sql("select * from table", Encoders.bean(Record.class))); - } - - @Test - public void testSelectRewrite() { - List expected = ImmutableList.of(new Record(3, "c", Double.NaN)); - - Assert.assertEquals( - "Should return all expected rows", - expected, - sql("SELECT * FROM table where doubleVal = double('NaN')", Encoders.bean(Record.class))); - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - - Expression filter = lastScanEvent.filter(); - Assert.assertEquals("Should create AND expression", Expression.Operation.AND, filter.op()); - Expression left = ((And) filter).left(); - Expression right = ((And) filter).right(); - - Assert.assertEquals( - "Left expression should be NOT_NULL", Expression.Operation.NOT_NULL, left.op()); - Assert.assertTrue( - "Left expression should contain column name 'doubleVal'", - left.toString().contains("doubleVal")); - Assert.assertEquals( - "Right expression should be IS_NAN", Expression.Operation.IS_NAN, right.op()); - Assert.assertTrue( - "Right expression should contain column name 'doubleVal'", - right.toString().contains("doubleVal")); - } - - @Test - public void testProjection() { - List expected = ImmutableList.of(1, 2, 3); - - Assert.assertEquals( - "Should return all expected rows", expected, sql("SELECT id FROM table", Encoders.INT())); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - Assert.assertEquals( - "Should project only the id column", - table.schema().select("id").asStruct(), - lastScanEvent.projection().asStruct()); - } - - @Test - public void testExpressionPushdown() { - List expected = ImmutableList.of("b"); - - Assert.assertEquals( - "Should return all expected rows", - expected, - sql("SELECT data FROM table WHERE id = 2", Encoders.STRING())); - - Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals( - "Should project only id and data columns", - table.schema().select("id", "data").asStruct(), - lastScanEvent.projection().asStruct()); - } - - private List sql(String str, Encoder encoder) { - return spark.sql(str).as(encoder).collectAsList(); - } - - public static class Record implements Serializable { - private Integer id; - private String data; - private Double doubleVal; - - public Record() {} - - Record(Integer id, String data, Double doubleVal) { - this.id = id; - this.data = data; - this.doubleVal = doubleVal; - } - - public void setId(Integer id) { - this.id = id; - } - - public void setData(String data) { - this.data = data; - } - - public void setDoubleVal(Double doubleVal) { - this.doubleVal = doubleVal; - } - - public Integer getId() { - return 
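// Editor's note (not part of the patch): the removed TestSelect verified filter push-down and
// column pruning by listening for scan events. A hedged sketch of that listener pattern using
// only the core Listeners/ScanEvent API referenced above; running an actual Spark query to
// trigger the event is assumed to happen elsewhere.
import java.util.concurrent.atomic.AtomicReference;
import org.apache.iceberg.events.Listeners;
import org.apache.iceberg.events.ScanEvent;

public class ScanEventSketch {
  public static void main(String[] args) {
    AtomicReference<ScanEvent> lastScan = new AtomicReference<>();
    Listeners.register(lastScan::set, ScanEvent.class);

    // ... plan a table scan or run a Spark query against an Iceberg table here ...

    ScanEvent event = lastScan.get();
    if (event != null) {
      // filter() is the residual expression pushed to the scan;
      // projection() is the pruned read schema.
      System.out.println("pushed filter: " + event.filter());
      System.out.println("projected schema: " + event.projection().asStruct());
    }
  }
}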
id; - } - - public String getData() { - return data; - } - - public Double getDoubleVal() { - return doubleVal; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - Record record = (Record) o; - return Objects.equal(id, record.id) - && Objects.equal(data, record.data) - && Objects.equal(doubleVal, record.doubleVal); - } - - @Override - public int hashCode() { - return Objects.hashCode(id, data, doubleVal); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java deleted file mode 100644 index 26645167f6af..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.IOException; -import java.util.List; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSnapshotSelection { - - private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestSnapshotSelection.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestSnapshotSelection.spark; - TestSnapshotSelection.spark = null; - currentSpark.stop(); - } - - @Test - public void testSnapshotSelectionById() throws IOException { - String tableLocation = 
temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = tables.create(SCHEMA, spec, tableLocation); - - // produce the first snapshot - List firstBatchRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); - firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - // produce the second snapshot - List secondBatchRecords = - Lists.newArrayList( - new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); - Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); - secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); - - // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); - List currentSnapshotRecords = - currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(firstBatchRecords); - expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals( - "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); - - // verify records in the previous snapshot - Snapshot currentSnapshot = table.currentSnapshot(); - Long parentSnapshotId = currentSnapshot.parentId(); - Dataset previousSnapshotResult = - spark.read().format("iceberg").option("snapshot-id", parentSnapshotId).load(tableLocation); - List previousSnapshotRecords = - previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals( - "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); - } - - @Test - public void testSnapshotSelectionByTimestamp() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = tables.create(SCHEMA, spec, tableLocation); - - // produce the first snapshot - List firstBatchRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); - firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - // remember the time when the first snapshot was valid - long firstSnapshotTimestamp = System.currentTimeMillis(); - - // produce the second snapshot - List secondBatchRecords = - Lists.newArrayList( - new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); - Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); - secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); - - // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); - List currentSnapshotRecords = - currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expectedRecords = Lists.newArrayList(); - expectedRecords.addAll(firstBatchRecords); - 
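// Editor's note (not part of the patch): the snapshot-selection behaviour covered by this
// removed test is not Spark 2.4 specific; the same read options are used by the Spark 3.x
// modules. A hedged sketch; the SparkSession and table location are assumed to exist, and
// only one of snapshot-id / as-of-timestamp may be set on a single read.
import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class SnapshotSelectionSketch {
  static Dataset<Row> readSnapshot(SparkSession spark, String tableLocation, long snapshotId) {
    // Time travel by snapshot id; SparkReadOptions.AS_OF_TIMESTAMP works the same way
    // with a millisecond timestamp instead of an id.
    return spark
        .read()
        .format("iceberg")
        .option(SparkReadOptions.SNAPSHOT_ID, snapshotId)
        .load(tableLocation);
  }
}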
expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals( - "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); - - // verify records in the previous snapshot - Dataset previousSnapshotResult = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) - .load(tableLocation); - List previousSnapshotRecords = - previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals( - "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); - } - - @Test - public void testSnapshotSelectionByInvalidSnapshotId() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - tables.create(SCHEMA, spec, tableLocation); - - Assertions.assertThatThrownBy( - () -> spark.read().format("iceberg").option("snapshot-id", -10).load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find snapshot with ID -10"); - } - - @Test - public void testSnapshotSelectionByInvalidTimestamp() throws IOException { - long timestamp = System.currentTimeMillis(); - - String tableLocation = temp.newFolder("iceberg-table").toString(); - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - tables.create(SCHEMA, spec, tableLocation); - - Assertions.assertThatThrownBy( - () -> - spark - .read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot find a snapshot older than"); - } - - @Test - public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = tables.create(SCHEMA, spec, tableLocation); - - List firstBatchRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); - firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - long timestamp = System.currentTimeMillis(); - long snapshotId = table.currentSnapshot().snapshotId(); - - Assertions.assertThatThrownBy( - () -> - spark - .read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot scan using both snapshot-id and as-of-timestamp"); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java deleted file mode 100644 index 3fb2a630fe81..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.List; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileAppenderFactory; -import org.apache.iceberg.io.TestAppenderFactory; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; - -public class TestSparkAppenderFactory extends TestAppenderFactory { - - private final StructType sparkType; - - public TestSparkAppenderFactory(String fileFormat, boolean partitioned) { - super(fileFormat, partitioned); - this.sparkType = SparkSchemaUtil.convert(SCHEMA); - } - - @Override - protected FileAppenderFactory createAppenderFactory( - List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { - return SparkAppenderFactory.builderFor(table, table.schema(), sparkType) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .eqDeleteRowSchema(eqDeleteSchema) - .posDelRowSchema(posDeleteRowSchema) - .build(); - } - - @Override - protected InternalRow createRow(Integer id, String data) { - InternalRow row = new GenericInternalRow(2); - row.update(0, id); - row.update(1, UTF8String.fromString(data)); - return row; - } - - @Override - protected StructLikeSet expectedRowSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - for (InternalRow row : rows) { - InternalRowWrapper wrapper = new InternalRowWrapper(sparkType); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java deleted file mode 100644 index 6c4239371476..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.Files.localOutput; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.StreamSupport; -import org.apache.avro.generic.GenericData; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.BaseCombinedScanTask; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.CloseableIterator; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSparkBaseDataReader { - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private Table table; - - // Simulates the closeable iterator of data to be read - private static class CloseableIntegerRange implements CloseableIterator { - boolean closed; - Iterator iter; - - CloseableIntegerRange(long range) { - this.closed = false; - this.iter = IntStream.range(0, (int) range).iterator(); - } - - @Override - public void close() { - this.closed = true; - } - - @Override - public boolean hasNext() { - return iter.hasNext(); - } - - @Override - public Integer next() { - return iter.next(); - } - } - - // Main reader class to test base class iteration logic. - // Keeps track of iterator closure. 
- private static class ClosureTrackingReader extends BaseDataReader { - private Map tracker = Maps.newHashMap(); - - ClosureTrackingReader(Table table, List tasks) { - super(table, new BaseCombinedScanTask(tasks)); - } - - @Override - CloseableIterator open(FileScanTask task) { - CloseableIntegerRange intRange = new CloseableIntegerRange(task.file().recordCount()); - tracker.put(getKey(task), intRange); - return intRange; - } - - public Boolean isIteratorClosed(FileScanTask task) { - return tracker.get(getKey(task)).closed; - } - - public Boolean hasIterator(FileScanTask task) { - return tracker.containsKey(getKey(task)); - } - - private String getKey(FileScanTask task) { - return task.file().path().toString(); - } - } - - @Test - public void testClosureOnDataExhaustion() throws IOException { - Integer totalTasks = 10; - Integer recordPerTask = 10; - List tasks = createFileScanTasks(totalTasks, recordPerTask); - - ClosureTrackingReader reader = new ClosureTrackingReader(table, tasks); - - int countRecords = 0; - while (reader.next()) { - countRecords += 1; - Assert.assertNotNull("Reader should return non-null value", reader.get()); - } - - Assert.assertEquals( - "Reader returned incorrect number of records", totalTasks * recordPerTask, countRecords); - tasks.forEach( - t -> - Assert.assertTrue( - "All iterators should be closed after read exhausion", reader.isIteratorClosed(t))); - } - - @Test - public void testClosureDuringIteration() throws IOException { - Integer totalTasks = 2; - Integer recordPerTask = 1; - List tasks = createFileScanTasks(totalTasks, recordPerTask); - Assert.assertEquals(2, tasks.size()); - FileScanTask firstTask = tasks.get(0); - FileScanTask secondTask = tasks.get(1); - - ClosureTrackingReader reader = new ClosureTrackingReader(table, tasks); - - // Total of 2 elements - Assert.assertTrue(reader.next()); - Assert.assertFalse( - "First iter should not be closed on its last element", reader.isIteratorClosed(firstTask)); - - Assert.assertTrue(reader.next()); - Assert.assertTrue( - "First iter should be closed after moving to second iter", - reader.isIteratorClosed(firstTask)); - Assert.assertFalse( - "Second iter should not be closed on its last element", - reader.isIteratorClosed(secondTask)); - - Assert.assertFalse(reader.next()); - Assert.assertTrue(reader.isIteratorClosed(firstTask)); - Assert.assertTrue(reader.isIteratorClosed(secondTask)); - } - - @Test - public void testClosureWithoutAnyRead() throws IOException { - Integer totalTasks = 10; - Integer recordPerTask = 10; - List tasks = createFileScanTasks(totalTasks, recordPerTask); - - ClosureTrackingReader reader = new ClosureTrackingReader(table, tasks); - - reader.close(); - - tasks.forEach( - t -> - Assert.assertFalse( - "Iterator should not be created eagerly for tasks", reader.hasIterator(t))); - } - - @Test - public void testExplicitClosure() throws IOException { - Integer totalTasks = 10; - Integer recordPerTask = 10; - List tasks = createFileScanTasks(totalTasks, recordPerTask); - - ClosureTrackingReader reader = new ClosureTrackingReader(table, tasks); - - Integer halfDataSize = (totalTasks * recordPerTask) / 2; - for (int i = 0; i < halfDataSize; i++) { - Assert.assertTrue("Reader should have some element", reader.next()); - Assert.assertNotNull("Reader should return non-null value", reader.get()); - } - - reader.close(); - - // Some tasks might have not been opened yet, so we don't have corresponding tracker for it. - // But all that have been created must be closed. 
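// Editor's note (not part of the patch): these removed tests pin down the reader contract that
// per-task iterators are opened lazily and closed once exhausted or when the reader itself is
// closed. A small sketch of that closure contract using the public CloseableIterator API; the
// integer-range iterator simply mirrors the CloseableIntegerRange helper above.
import java.util.Iterator;
import java.util.stream.IntStream;
import org.apache.iceberg.io.CloseableIterator;

class ClosureSketch {
  public static void main(String[] args) throws Exception {
    Iterator<Integer> values = IntStream.range(0, 3).iterator();
    CloseableIterator<Integer> iter =
        new CloseableIterator<Integer>() {
          @Override public boolean hasNext() { return values.hasNext(); }
          @Override public Integer next() { return values.next(); }
          @Override public void close() { System.out.println("closed"); }
        };

    // try-with-resources guarantees close() even if consumption stops early, which is the
    // property the explicit and idempotent closure tests assert against the reader.
    try (CloseableIterator<Integer> rows = iter) {
      while (rows.hasNext()) {
        System.out.println(rows.next());
      }
    }
  }
}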
- tasks.forEach( - t -> { - if (reader.hasIterator(t)) { - Assert.assertTrue( - "Iterator should be closed after read exhausion", reader.isIteratorClosed(t)); - } - }); - } - - @Test - public void testIdempotentExplicitClosure() throws IOException { - Integer totalTasks = 10; - Integer recordPerTask = 10; - List tasks = createFileScanTasks(totalTasks, recordPerTask); - - ClosureTrackingReader reader = new ClosureTrackingReader(table, tasks); - - // Total 100 elements, only 5 iterators have been created - for (int i = 0; i < 45; i++) { - Assert.assertTrue("eader should have some element", reader.next()); - Assert.assertNotNull("Reader should return non-null value", reader.get()); - } - - for (int closeAttempt = 0; closeAttempt < 5; closeAttempt++) { - reader.close(); - for (int i = 0; i < 5; i++) { - Assert.assertTrue( - "Iterator should be closed after read exhausion", - reader.isIteratorClosed(tasks.get(i))); - } - for (int i = 5; i < 10; i++) { - Assert.assertFalse( - "Iterator should not be created eagerly for tasks", reader.hasIterator(tasks.get(i))); - } - } - } - - private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) - throws IOException { - String desc = "make_scan_tasks"; - File parent = temp.newFolder(desc); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - - Schema schema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); - - try { - this.table = TestTables.create(location, desc, schema, PartitionSpec.unpartitioned()); - // Important: use the table's schema for the rest of the test - // When tables are created, the column ids are reassigned. - Schema tableSchema = table.schema(); - List expected = RandomData.generateList(tableSchema, recordPerTask, 1L); - - AppendFiles appendFiles = table.newAppend(); - for (int i = 0; i < totalTasks; i++) { - File parquetFile = new File(dataFolder, PARQUET.addExtension(UUID.randomUUID().toString())); - try (FileAppender writer = - Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { - writer.addAll(expected); - } - DataFile file = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(recordPerTask) - .build(); - appendFiles.appendFile(file); - } - appendFiles.commit(); - - return StreamSupport.stream(table.newScan().planFiles().spliterator(), false) - .collect(Collectors.toList()); - } finally { - TestTables.clearTables(); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java deleted file mode 100644 index b1f2082b5d9b..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestReader; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkDataFile; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.types.Types; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.ColumnName; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSparkDataFile { - - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - ); - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA) - .identity("b") - .bucket("i", 2) - .identity("l") - .identity("f") - .identity("d") - .identity("date") - .hour("ts") - .identity("ts") - .truncate("s", 2) - .identity("bytes") - .bucket("dec_9_0", 2) - .bucket("dec_11_2", 2) - .bucket("dec_38_10", 2) - .build(); - - private static SparkSession 
spark; - private static JavaSparkContext sparkContext = null; - - @BeforeClass - public static void startSpark() { - TestSparkDataFile.spark = SparkSession.builder().master("local[2]").getOrCreate(); - TestSparkDataFile.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestSparkDataFile.spark; - TestSparkDataFile.spark = null; - TestSparkDataFile.sparkContext = null; - currentSpark.stop(); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - private String tableLocation = null; - - @Before - public void setupTableLocation() throws Exception { - File tableDir = temp.newFolder(); - this.tableLocation = tableDir.toURI().toString(); - } - - @Test - public void testValueConversion() throws IOException { - Table table = - TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - checkSparkDataFile(table); - } - - @Test - public void testValueConversionPartitionedTable() throws IOException { - Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - checkSparkDataFile(table); - } - - @Test - public void testValueConversionWithEmptyStats() throws IOException { - Map props = Maps.newHashMap(); - props.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); - Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - checkSparkDataFile(table); - } - - private void checkSparkDataFile(Table table) throws IOException { - Iterable rows = RandomData.generateSpark(table.schema(), 200, 0); - JavaRDD rdd = sparkContext.parallelize(Lists.newArrayList(rows)); - Dataset df = - spark.internalCreateDataFrame( - JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); - - df.write().format("iceberg").mode("append").save(tableLocation); - - table.refresh(); - - List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should have 1 manifest", 1, manifests.size()); - - List dataFiles = Lists.newArrayList(); - try (ManifestReader reader = ManifestFiles.read(manifests.get(0), table.io())) { - for (DataFile dataFile : reader) { - checkDataFile(dataFile.copy(), DataFiles.builder(table.spec()).copy(dataFile).build()); - dataFiles.add(dataFile.copy()); - } - } - - Dataset dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files"); - - // reorder columns to test arbitrary projections - List columns = - Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList()); - Collections.shuffle(columns); - - List sparkDataFiles = - dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList(); - - Assert.assertEquals( - "The number of files should match", dataFiles.size(), sparkDataFiles.size()); - - Types.StructType dataFileType = DataFile.getType(table.spec().partitionType()); - StructType sparkDataFileType = sparkDataFiles.get(0).schema(); - SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkDataFileType); - - for (int i = 0; i < dataFiles.size(); i++) { - checkDataFile(dataFiles.get(i), wrapper.wrap(sparkDataFiles.get(i))); - } - } - - private void checkDataFile(DataFile expected, DataFile actual) { - Assert.assertEquals("Path must match", expected.path(), actual.path()); - Assert.assertEquals("Format must match", expected.format(), actual.format()); - Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount()); - Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes()); 
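// Editor's note (not part of the patch): the metadata-table read used in the removed test is
// also not Spark 2.4 specific; for path-based tables the "#files" suffix loads the files
// metadata table in the Spark 3.x modules as well. A hedged sketch; the SparkSession and
// table location are assumed to exist.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class FilesMetadataSketch {
  static Dataset<Row> dataFileRows(SparkSession spark, String tableLocation) {
    // Each row describes one data file (path, format, record_count, metrics, partition, ...),
    // which is what the SparkDataFile wrapper above adapts back to the DataFile interface.
    return spark.read().format("iceberg").load(tableLocation + "#files");
  }
}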
- Assert.assertEquals( - "Record value counts must match", expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals( - "Record null value counts must match", - expected.nullValueCounts(), - actual.nullValueCounts()); - Assert.assertEquals( - "Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); - Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds()); - Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds()); - Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata()); - Assert.assertEquals("Split offsets must match", expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Sort order id must match", expected.sortOrderId(), actual.sortOrderId()); - - checkStructLike(expected.partition(), actual.partition()); - } - - private void checkStructLike(StructLike expected, StructLike actual) { - Assert.assertEquals("Struct size should match", expected.size(), actual.size()); - for (int i = 0; i < expected.size(); i++) { - Assert.assertEquals( - "Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java deleted file mode 100644 index a2fb66cf4be8..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ /dev/null @@ -1,655 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AppendFiles; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.exceptions.CommitStateUnknownException; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkWriteOptions; -import org.apache.iceberg.types.Types; -import org.apache.spark.SparkException; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestSparkDataWrite { - private static final Configuration CONF = new Configuration(); - private final FileFormat format; - private static SparkSession spark = null; - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Parameterized.Parameters(name = "format = {0}") - public static Object[] parameters() { - return new Object[] {"parquet", "avro", "orc"}; - } - - @BeforeClass - public static void startSpark() { - TestSparkDataWrite.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @Parameterized.AfterParam - public static void clearSourceCache() { - ManualSource.clearTables(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestSparkDataWrite.spark; - TestSparkDataWrite.spark = null; - currentSpark.stop(); - } - - public TestSparkDataWrite(String format) { - this.format = FileFormat.fromString(format); - } - - @Test - public void testBasicWrite() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - // TODO: incoming columns must be ordered according to the table's schema - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - 
.mode(SaveMode.Append) - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) { - for (DataFile file : ManifestFiles.read(manifest, table.io())) { - // TODO: avro not support split - if (!format.equals(FileFormat.AVRO)) { - Assert.assertNotNull("Split offsets not present", file.splitOffsets()); - } - Assert.assertEquals("Should have reported record count as 1", 1, file.recordCount()); - // TODO: append more metric info - if (format.equals(FileFormat.PARQUET)) { - Assert.assertNotNull("Column sizes metric not present", file.columnSizes()); - Assert.assertNotNull("Counts metric not present", file.valueCounts()); - Assert.assertNotNull("Null value counts metric not present", file.nullValueCounts()); - Assert.assertNotNull("Lower bounds metric not present", file.lowerBounds()); - Assert.assertNotNull("Upper bounds metric not present", file.upperBounds()); - } - } - } - } - - @Test - public void testAppend() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "a"), - new SimpleRecord(5, "b"), - new SimpleRecord(6, "c")); - - Dataset df = spark.createDataFrame(records, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - df.withColumn("id", df.col("id").plus(3)) - .select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testEmptyOverwrite() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - List expected = records; - Dataset df = spark.createDataFrame(records, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - Dataset 
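Most of the write tests in the removed TestSparkDataWrite follow the same DataSource V1 round trip: create a HadoopTables table, append a DataFrame through the "iceberg" source with a write-format option, then read the rows back and compare. A condensed sketch of that flow is below; it assumes the test module's SimpleRecord(id, data) bean is available in the same package and that args[0] points at an empty scratch directory, and the class and variable names are illustrative.

import static org.apache.iceberg.types.Types.NestedField.optional;

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class WriteRoundTripSketch {
  private static final Schema SCHEMA = new Schema(
      optional(1, "id", Types.IntegerType.get()),
      optional(2, "data", Types.StringType.get()));

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[2]").getOrCreate();
    String location = args[0]; // empty directory to hold the new table

    // Create a table partitioned by the "data" column.
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
    new HadoopTables(new Configuration()).create(SCHEMA, spec, location);

    List<SimpleRecord> records = Lists.newArrayList(
        new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);

    // Columns must be ordered to match the table schema before writing.
    df.select("id", "data")
        .write()
        .format("iceberg")
        .option(SparkWriteOptions.WRITE_FORMAT, "parquet")
        .mode(SaveMode.Append)
        .save(location);

    // Read the rows back through the same source.
    List<SimpleRecord> actual = spark.read().format("iceberg").load(location)
        .orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();
    System.out.println(actual);

    spark.stop();
  }
}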
empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class); - empty - .select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Overwrite) - .option("overwrite-mode", "dynamic") - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testOverwrite() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "b"), - new SimpleRecord(6, "c")); - - Dataset df = spark.createDataFrame(records, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - // overwrite with 2*id to replace record 2, append 4 and 6 - df.withColumn("id", df.col("id").multiply(2)) - .select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Overwrite) - .option("overwrite-mode", "dynamic") - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testUnpartitionedOverwrite() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - // overwrite with the same data; should not produce two copies - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Overwrite) - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - 
public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - table - .updateProperties() - .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger - .commit(); - - List expected = Lists.newArrayListWithCapacity(4000); - for (int i = 0; i < 4000; i++) { - expected.add(new SimpleRecord(i, "a")); - } - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - - List files = Lists.newArrayList(); - for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) { - for (DataFile file : ManifestFiles.read(manifest, table.io())) { - files.add(file); - } - } - - Assert.assertEquals("Should have 4 DataFiles", 4, files.size()); - Assert.assertTrue( - "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); - } - - @Test - public void testPartitionedCreateWithTargetFileSizeViaOption() throws IOException { - partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType.NONE); - } - - @Test - public void testPartitionedFanoutCreateWithTargetFileSizeViaOption() throws IOException { - partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType.TABLE); - } - - @Test - public void testPartitionedFanoutCreateWithTargetFileSizeViaOption2() throws IOException { - partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType.JOB); - } - - @Test - public void testWriteProjection() throws IOException { - Assume.assumeTrue( - "Not supported in Spark 3; analysis requires all columns are present", - spark.version().startsWith("2")); - - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("id") - .write() // select only id column - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testWriteProjectionWithMiddle() throws IOException { - Assume.assumeTrue( - "Not supported in Spark 3; analysis requires all columns are present", - 
spark.version().startsWith("2")); - - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Schema schema = - new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get())); - Table table = tables.create(schema, spec, location.toString()); - - List expected = - Lists.newArrayList( - new ThreeColumnRecord(1, null, "hello"), - new ThreeColumnRecord(2, null, "world"), - new ThreeColumnRecord(3, null, null)); - - Dataset df = spark.createDataFrame(expected, ThreeColumnRecord.class); - - df.select("c1", "c3") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - } - - @Test - public void testViewsReturnRecentResults() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - tables.create(SCHEMA, spec, location.toString()); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(records, SimpleRecord.class); - - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - Dataset query = spark.read().format("iceberg").load(location.toString()).where("id = 1"); - query.createOrReplaceTempView("tmp"); - - List actual1 = - spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected1 = Lists.newArrayList(new SimpleRecord(1, "a")); - Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size()); - Assert.assertEquals("Result rows should match", expected1, actual1); - - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .save(location.toString()); - - List actual2 = - spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected2 = - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a")); - Assert.assertEquals("Number of rows should match", expected2.size(), actual2.size()); - Assert.assertEquals("Result rows should match", expected2, actual2); - } - - public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType option) - throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "test"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List expected = Lists.newArrayListWithCapacity(8000); - for (int i = 0; i < 2000; i++) { - expected.add(new SimpleRecord(i, "a")); - expected.add(new SimpleRecord(i, "b")); - expected.add(new 
SimpleRecord(i, "c")); - expected.add(new SimpleRecord(i, "d")); - } - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - switch (option) { - case NONE: - df.select("id", "data") - .sort("data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, 4) // ~4 bytes; low enough to trigger - .save(location.toString()); - break; - case TABLE: - table.updateProperties().set(SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true").commit(); - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, 4) // ~4 bytes; low enough to trigger - .save(location.toString()); - break; - case JOB: - df.select("id", "data") - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) - .mode(SaveMode.Append) - .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, 4) // ~4 bytes; low enough to trigger - .option(SparkWriteOptions.FANOUT_ENABLED, true) - .save(location.toString()); - break; - default: - break; - } - - table.refresh(); - - Dataset result = spark.read().format("iceberg").load(location.toString()); - - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - - List files = Lists.newArrayList(); - for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) { - for (DataFile file : ManifestFiles.read(manifest, table.io())) { - files.add(file); - } - } - - Assert.assertEquals("Should have 8 DataFiles", 8, files.size()); - Assert.assertTrue( - "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); - } - - @Test - public void testCommitUnknownException() throws IOException { - File parent = temp.newFolder(format.toString()); - File location = new File(parent, "commitunknown"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(records, SimpleRecord.class); - - AppendFiles append = table.newFastAppend(); - AppendFiles spyAppend = spy(append); - doAnswer( - invocation -> { - append.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }) - .when(spyAppend) - .commit(); - - Table spyTable = spy(table); - when(spyTable.newAppend()).thenReturn(spyAppend); - - String manualTableName = "unknown_exception"; - ManualSource.setTable(manualTableName, spyTable); - - // Although an exception is thrown here, write and commit have succeeded - AssertHelpers.assertThrowsWithCause( - "Should throw a Commit State Unknown Exception", - SparkException.class, - "Writing job aborted", - CommitStateUnknownException.class, - "Datacenter on Fire", - () -> - df.select("id", "data") - .sort("data") - .write() - .format("org.apache.iceberg.spark.source.ManualSource") - .option(ManualSource.TABLE_NAME, manualTableName) - .mode(SaveMode.Append) - .save(location.toString())); - - // Since write and commit succeeded, the rows should be readable - Dataset result 
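The three IcebergOptionsType branches in partitionedCreateWithTargetFileSizeViaOption above differ only in where the knobs live: fanout as a table property, or both the target file size and fanout as per-write options. A sketch of the per-write form, assuming the same partitioned table and DataFrame as in the earlier write sketch; the method and class names are illustrative.

import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class FanoutWriteSketch {
  // Appends df to the Iceberg table at location, rolling to a new data file
  // roughly every targetFileSizeBytes and keeping a writer open per partition
  // so the input does not have to be pre-sorted by the partition column.
  static void appendWithFanout(Dataset<Row> df, String location, long targetFileSizeBytes) {
    df.select("id", "data")
        .write()
        .format("iceberg")
        .option(SparkWriteOptions.WRITE_FORMAT, "parquet")
        .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, targetFileSizeBytes)
        .option(SparkWriteOptions.FANOUT_ENABLED, true)
        .mode(SaveMode.Append)
        .save(location);
  }
}

Without fanout (the NONE branch above), the test sorts the input by the partition column first, which keeps only one open file per partition at a time.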
= spark.read().format("iceberg").load(location.toString()); - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", records.size(), actual.size()); - Assert.assertEquals("Result rows should match", records, actual); - } - - public enum IcebergOptionsType { - NONE, - TABLE, - JOB - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java deleted file mode 100644 index 4a3263e368c0..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.List; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestFileWriterFactory; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; - -public class TestSparkFileWriterFactory extends TestFileWriterFactory { - - public TestSparkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { - super(fileFormat, partitioned); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return SparkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected InternalRow toRow(Integer id, String data) { - InternalRow row = new GenericInternalRow(2); - row.update(0, id); - row.update(1, UTF8String.fromString(data)); - return row; - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - StructType sparkType = SparkSchemaUtil.convert(table.schema()); - for (InternalRow row : rows) { - InternalRowWrapper wrapper = new InternalRowWrapper(sparkType); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java 
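TestSparkFileWriterFactory above plugs Spark's InternalRow into the generic writer-factory test suite; the part worth noting is the builder call it uses. A sketch of building the factory for Parquet data and delete files follows, assuming an existing table; it stays in the same package as the removed test since the factory class appears to be package-scoped, and the class and method names are illustrative.

package org.apache.iceberg.spark.source;

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.FileWriterFactory;
import org.apache.spark.sql.catalyst.InternalRow;

public class WriterFactorySketch {
  // Builds a factory that writes Spark InternalRow data (and deletes) as Parquet.
  static FileWriterFactory<InternalRow> parquetWriterFactory(Table table) {
    return SparkFileWriterFactory.builderFor(table)
        .dataSchema(table.schema())
        .dataFileFormat(FileFormat.PARQUET)
        .deleteFileFormat(FileFormat.PARQUET)
        .build();
  }
}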
b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java deleted file mode 100644 index c3bb35ca7df8..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestMergingMetrics; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.spark.sql.catalyst.InternalRow; - -public class TestSparkMergingMetrics extends TestMergingMetrics { - - public TestSparkMergingMetrics(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileAppender writeAndGetAppender(List records) throws IOException { - Table testTable = - new BaseTable(null, "dummy") { - @Override - public Map properties() { - return Collections.emptyMap(); - } - - @Override - public SortOrder sortOrder() { - return SortOrder.unsorted(); - } - - @Override - public PartitionSpec spec() { - return PartitionSpec.unpartitioned(); - } - }; - - FileAppender appender = - SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)) - .build() - .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); - try (FileAppender fileAppender = appender) { - records.stream() - .map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)) - .forEach(fileAppender::add); - } - return appender; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java deleted file mode 100644 index 276d8c632fc0..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.util.List; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestPartitioningWriters; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; - -public class TestSparkPartitioningWriters extends TestPartitioningWriters { - - public TestSparkPartitioningWriters(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return SparkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected InternalRow toRow(Integer id, String data) { - InternalRow row = new GenericInternalRow(2); - row.update(0, id); - row.update(1, UTF8String.fromString(data)); - return row; - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - StructType sparkType = SparkSchemaUtil.convert(table.schema()); - for (InternalRow row : rows) { - InternalRowWrapper wrapper = new InternalRowWrapper(sparkType); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java deleted file mode 100644 index 245c392774f5..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import java.util.List; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestPositionDeltaWriters; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; - -public class TestSparkPositionDeltaWriters extends TestPositionDeltaWriters { - - public TestSparkPositionDeltaWriters(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return SparkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected InternalRow toRow(Integer id, String data) { - InternalRow row = new GenericInternalRow(2); - row.update(0, id); - row.update(1, UTF8String.fromString(data)); - return row; - } - - @Override - protected StructLikeSet toSet(Iterable rows) { - StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - StructType sparkType = SparkSchemaUtil.convert(table.schema()); - for (InternalRow row : rows) { - InternalRowWrapper wrapper = new InternalRowWrapper(sparkType); - set.add(wrapper.wrap(row)); - } - return set; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java deleted file mode 100644 index dde1eb7b36ec..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.Files.localOutput; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.UUID; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkValueConverter; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestSparkReadProjection extends TestReadProjection { - - private static SparkSession spark = null; - - @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"parquet", false}, - {"parquet", true}, - {"avro", false}, - {"orc", false}, - {"orc", true} - }; - } - - private final FileFormat format; - private final boolean vectorized; - - public TestSparkReadProjection(String format, boolean vectorized) { - super(format); - this.format = FileFormat.fromString(format); - this.vectorized = vectorized; - } - - @BeforeClass - public static void startSpark() { - TestSparkReadProjection.spark = SparkSession.builder().master("local[2]").getOrCreate(); - ImmutableMap config = - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false"); - spark - .conf() - .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach( - (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestSparkReadProjection.spark; - TestSparkReadProjection.spark = null; - currentSpark.stop(); - } - - @Override - protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) - throws IOException { - File parent = temp.newFolder(desc); - File location = new File(parent, "test"); - File dataFolder = new File(location, "data"); - Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - - File testFile = new File(dataFolder, format.addExtension(UUID.randomUUID().toString())); - - Table table = TestTables.create(location, desc, writeSchema, PartitionSpec.unpartitioned()); - try { - // Important: use the table's schema for the rest of the test - // When tables are created, the column ids are reassigned. 
- Schema tableSchema = table.schema(); - - try (FileAppender writer = - new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), format)) { - writer.add(record); - } - - DataFile file = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); - - table.newAppend().appendFile(file).commit(); - - // rewrite the read schema for the table's reassigned ids - Map idMapping = Maps.newHashMap(); - for (int id : allIds(writeSchema)) { - // translate each id to the original schema's column name, then to the new schema's id - String originalName = writeSchema.findColumnName(id); - idMapping.put(id, tableSchema.findField(originalName).fieldId()); - } - Schema expectedSchema = reassignIds(readSchema, idMapping); - - // Set the schema to the expected schema directly to simulate the table schema evolving - TestTables.replaceMetadata( - desc, TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); - - Dataset df = - spark - .read() - .format("org.apache.iceberg.spark.source.TestIcebergSource") - .option("iceberg.table.name", desc) - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(); - - return SparkValueConverter.convert(readSchema, df.collectAsList().get(0)); - - } finally { - TestTables.clearTables(); - } - } - - private List allIds(Schema schema) { - List ids = Lists.newArrayList(); - TypeUtil.visit( - schema, - new TypeUtil.SchemaVisitor() { - @Override - public Void field(Types.NestedField field, Void fieldResult) { - ids.add(field.fieldId()); - return null; - } - - @Override - public Void list(Types.ListType list, Void elementResult) { - ids.add(list.elementId()); - return null; - } - - @Override - public Void map(Types.MapType map, Void keyResult, Void valueResult) { - ids.add(map.keyId()); - ids.add(map.valueId()); - return null; - } - }); - return ids; - } - - private Schema reassignIds(Schema schema, Map idMapping) { - return new Schema( - TypeUtil.visit( - schema, - new TypeUtil.SchemaVisitor() { - private int mapId(int id) { - if (idMapping.containsKey(id)) { - return idMapping.get(id); - } - return 1000 + id; // make sure the new IDs don't conflict with reassignment - } - - @Override - public Type schema(Schema schema, Type structResult) { - return structResult; - } - - @Override - public Type struct(Types.StructType struct, List fieldResults) { - List newFields = - Lists.newArrayListWithExpectedSize(fieldResults.size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - if (field.isOptional()) { - newFields.add( - optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } else { - newFields.add( - required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } - } - return Types.StructType.of(newFields); - } - - @Override - public Type field(Types.NestedField field, Type fieldResult) { - return fieldResult; - } - - @Override - public Type list(Types.ListType list, Type elementResult) { - if (list.isElementOptional()) { - return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); - } else { - return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); - } - } - - @Override - public Type map(Types.MapType map, Type keyResult, Type valueResult) { - if (map.isValueOptional()) { - return Types.MapType.ofOptional( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } else { - return 
Types.MapType.ofRequired( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } - } - - @Override - public Type primitive(Type.PrimitiveType primitive) { - return primitive; - } - }) - .asNestedType() - .asStructType() - .fields()); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java deleted file mode 100644 index 462f34530725..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - -import java.io.IOException; -import java.util.List; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.CatalogUtil; -import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.Files; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.catalog.Namespace; -import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.data.DeleteReadTests; -import org.apache.iceberg.data.FileHelpers; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.hive.HiveCatalog; -import org.apache.iceberg.hive.TestHiveMetastore; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkStructLike; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.StructLikeSet; -import org.apache.iceberg.util.TableScanUtil; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.internal.SQLConf; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Test; - -public class TestSparkReaderDeletes extends DeleteReadTests { - - private static TestHiveMetastore metastore = null; - protected static SparkSession spark = null; - protected static HiveCatalog catalog = null; - - @BeforeClass - public static void 
startMetastoreAndSpark() { - metastore = new TestHiveMetastore(); - metastore.start(); - HiveConf hiveConf = metastore.hiveConf(); - - spark = - SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); - - catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - - try { - catalog.createNamespace(Namespace.of("default")); - } catch (AlreadyExistsException ignored) { - // the default namespace already exists. ignore the create error - } - } - - @AfterClass - public static void stopMetastoreAndSpark() throws Exception { - catalog = null; - metastore.stop(); - metastore = null; - spark.stop(); - spark = null; - } - - @Override - protected Table createTable(String name, Schema schema, PartitionSpec spec) { - Table table = catalog.createTable(TableIdentifier.of("default", name), schema); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(2)); - - return table; - } - - @Override - protected void dropTable(String name) { - catalog.dropTable(TableIdentifier.of("default", name)); - } - - @Override - public StructLikeSet rowSet(String name, Table table, String... columns) { - Dataset df = - spark - .read() - .format("iceberg") - .load(TableIdentifier.of("default", name).toString()) - .selectExpr(columns); - - Types.StructType projection = table.schema().select(columns).asStruct(); - StructLikeSet set = StructLikeSet.create(projection); - df.collectAsList() - .forEach( - row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - set.add(rowWrapper.wrap(row)); - }); - - return set; - } - - @Test - public void testEqualityDeleteWithFilter() throws IOException { - String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); - Schema deleteRowSchema = table.schema().select("data"); - Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = - Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(temp.newFile()), - TestHelpers.Row.of(0), - dataDeletes, - deleteRowSchema); - - table.newRowDelta().addDeletes(eqDeletes).commit(); - - Types.StructType projection = table.schema().select("*").asStruct(); - Dataset df = - spark - .read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .filter("data = 'a'") // select a deleted row - .selectExpr("*"); - - StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList() - .forEach( - row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); - - Assert.assertEquals("Table should contain no rows", 0, actual.size()); - } - - @Test - public void testReadEqualityDeleteRows() throws IOException { - Schema deleteSchema1 = table.schema().select("data"); - Record dataDelete = GenericRecord.create(deleteSchema1); - List dataDeletes = - Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d") // id = 89 - ); - - Schema deleteSchema2 = table.schema().select("id"); - Record idDelete = GenericRecord.create(deleteSchema2); - List idDeletes = - Lists.newArrayList( - 
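The rowSet helper above is the comparison idiom used throughout these reader tests: collect the DataFrame and re-wrap each Row as an Iceberg StructLike so result sets can be compared without caring about Spark types or row order. A small sketch of that helper, assuming an existing table and DataFrame; the class name is illustrative.

import java.util.List;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.SparkStructLike;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.StructLikeSet;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class RowSetSketch {
  // Collects the selected columns of a DataFrame into a StructLikeSet so the
  // result can be compared against generic Iceberg records, ignoring order.
  static StructLikeSet rowSet(Table table, Dataset<Row> df, String... columns) {
    Types.StructType projection = table.schema().select(columns).asStruct();
    StructLikeSet set = StructLikeSet.create(projection);
    List<Row> rows = df.selectExpr(columns).collectAsList();
    for (Row row : rows) {
      set.add(new SparkStructLike(projection).wrap(row));
    }
    return set;
  }
}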
idDelete.copy("id", 121), // id = 121 - idDelete.copy("id", 122) // id = 122 - ); - - DeleteFile eqDelete1 = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(temp.newFile()), - TestHelpers.Row.of(0), - dataDeletes, - deleteSchema1); - - DeleteFile eqDelete2 = - FileHelpers.writeDeleteFile( - table, - Files.localOutput(temp.newFile()), - TestHelpers.Row.of(0), - idDeletes, - deleteSchema2); - - table.newRowDelta().addDeletes(eqDelete1).addDeletes(eqDelete2).commit(); - - StructLikeSet expectedRowSet = rowSetWithIds(29, 89, 121, 122); - - Types.StructType type = table.schema().asStruct(); - StructLikeSet actualRowSet = StructLikeSet.create(type); - - CloseableIterable tasks = - TableScanUtil.planTasks( - table.newScan().planFiles(), - TableProperties.METADATA_SPLIT_SIZE_DEFAULT, - TableProperties.SPLIT_LOOKBACK_DEFAULT, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); - - for (CombinedScanTask task : tasks) { - try (EqualityDeleteRowReader reader = - new EqualityDeleteRowReader(task, table, table.schema(), false)) { - while (reader.next()) { - actualRowSet.add( - new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())) - .wrap(reader.get().copy())); - } - } - } - - Assert.assertEquals("should include 4 deleted row", 4, actualRowSet.size()); - Assert.assertEquals("deleted row should be matched", expectedRowSet, actualRowSet); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java deleted file mode 100644 index dcf9140a8885..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
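The equality-delete cases in TestSparkReaderDeletes above share the same two steps: write an equality delete file keyed on a subset of columns with the FileHelpers test utility, then commit it through a row delta. A sketch of that sequence, assuming a format v2 table with a "data" column, a scratch output file, and the FileHelpers/TestHelpers test utilities used by the removed test; the method and class names are illustrative.

import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.TestHelpers;
import org.apache.iceberg.data.FileHelpers;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;

public class EqualityDeleteSketch {
  // Deletes every row whose "data" column equals one of the given values by
  // writing an equality delete file and committing it as a row delta.
  static void deleteByData(Table table, File deleteOutput, String... values) throws IOException {
    Schema deleteRowSchema = table.schema().select("data");
    Record dataDelete = GenericRecord.create(deleteRowSchema);

    List<Record> dataDeletes = Lists.newArrayList();
    for (String value : values) {
      dataDeletes.add(dataDelete.copy("data", value));
    }

    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(
        table,
        Files.localOutput(deleteOutput),
        TestHelpers.Row.of(0), // partition of the delete file, as in the removed test
        dataDeletes,
        deleteRowSchema);

    table.newRowDelta().addDeletes(eqDeletes).commit();
  }
}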
- */ -package org.apache.iceberg.spark.source; - -import java.util.List; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestRollingFileWriters; -import org.apache.iceberg.util.ArrayUtil; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.unsafe.types.UTF8String; - -public class TestSparkRollingFileWriters extends TestRollingFileWriters { - - public TestSparkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { - super(fileFormat, partitioned); - } - - @Override - protected FileWriterFactory newWriterFactory( - Schema dataSchema, - List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { - return SparkFileWriterFactory.builderFor(table) - .dataSchema(table.schema()) - .dataFileFormat(format()) - .deleteFileFormat(format()) - .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) - .equalityDeleteRowSchema(equalityDeleteRowSchema) - .positionDeleteRowSchema(positionDeleteRowSchema) - .build(); - } - - @Override - protected InternalRow toRow(Integer id, String data) { - InternalRow row = new GenericInternalRow(2); - row.update(0, id); - row.update(1, UTF8String.fromString(data)); - return row; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java deleted file mode 100644 index 93cc24973f6c..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.IOException; -import java.util.List; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSparkSchema { - - private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static SparkSession spark = null; - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @BeforeClass - public static void startSpark() { - TestSparkSchema.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestSparkSchema.spark; - TestSparkSchema.spark = null; - currentSpark.stop(); - } - - @Test - public void testSparkReadSchemaIsHonored() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - tables.create(SCHEMA, spec, null, tableLocation); - - List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - StructType sparkReadSchema = - new StructType( - new StructField[] { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()) - }); - - Dataset resultDf = - spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation); - - Row[] results = (Row[]) resultDf.collect(); - - Assert.assertEquals("Result size matches", 1, results.length); - Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length()); - Assert.assertEquals("Row content matches data", 1, results[0].getInt(0)); - } - - @Test - public void testFailIfSparkReadSchemaIsOff() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - tables.create(SCHEMA, spec, null, tableLocation); - - List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - StructType sparkReadSchema = - new StructType( - new StructField[] { - new StructField( - "idd", DataTypes.IntegerType, true, Metadata.empty()) // wrong field name - }); - - AssertHelpers.assertThrows( - "Iceberg should not allow a projection that contain unknown fields", - 
java.lang.IllegalArgumentException.class, - "Field idd not found in source schema", - () -> spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation)); - } - - @Test - public void testSparkReadSchemaCombinedWithProjection() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - tables.create(SCHEMA, spec, null, tableLocation); - - List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - StructType sparkReadSchema = - new StructType( - new StructField[] { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("data", DataTypes.StringType, true, Metadata.empty()) - }); - - Dataset resultDf = - spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id"); - - Row[] results = (Row[]) resultDf.collect(); - - Assert.assertEquals("Result size matches", 1, results.length); - Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length()); - Assert.assertEquals("Row content matches data", 1, results[0].getInt(0)); - } - - @Test - public void testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() - throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - tables.create(SCHEMA, spec, null, tableLocation); - - List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); - - StructType sparkReadSchema = - new StructType( - new StructField[] { - new StructField("data", DataTypes.StringType, true, Metadata.empty()) - }); - - AssertHelpers.assertThrows( - "Spark should not allow a projection that is not included in the read schema", - org.apache.spark.sql.AnalysisException.class, - "cannot resolve '`id`' given input columns: [data]", - () -> - spark - .read() - .schema(sparkReadSchema) - .format("iceberg") - .load(tableLocation) - .select("id")); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java deleted file mode 100644 index f9d4329f9ed8..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java +++ /dev/null @@ -1,571 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.KryoHelpers; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.hive.HiveTableBaseTest; -import org.apache.iceberg.mapping.MappingUtil; -import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.SparkSchemaUtil; -import org.apache.iceberg.spark.SparkTableUtil; -import org.apache.iceberg.spark.SparkTableUtil.SparkPartition; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.TableIdentifier; -import org.assertj.core.api.Assertions; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Assume; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.runners.Enclosed; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Enclosed.class) -public class TestSparkTableUtil extends HiveTableBaseTest { - private static final String TABLE_NAME = "hive_table"; - private static final String QUALIFIED_TABLE_NAME = - String.format("%s.%s", HiveTableBaseTest.DB_NAME, TABLE_NAME); - private static final Path TABLE_LOCATION_PATH = - HiveTableBaseTest.getTableLocationPath(TABLE_NAME); - private static final String TABLE_LOCATION_STR = TABLE_LOCATION_PATH.toString(); - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - String metastoreURI = HiveTableBaseTest.hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname); - - // Create a spark session. - TestSparkTableUtil.spark = - SparkSession.builder() - .master("local[2]") - .enableHiveSupport() - .config("spark.hadoop.hive.metastore.uris", metastoreURI) - .config("hive.exec.dynamic.partition", "true") - .config("hive.exec.dynamic.partition.mode", "nonstrict") - .config("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true") - .getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestSparkTableUtil.spark; - // Stop the spark session. - TestSparkTableUtil.spark = null; - currentSpark.stop(); - } - - static void loadData(FileFormat fileFormat) { - // Create a hive table. 
- SQLContext sc = new SQLContext(TestSparkTableUtil.spark); - - sc.sql( - String.format( - "CREATE TABLE %s (\n" - + " id int COMMENT 'unique id'\n" - + ")\n" - + "PARTITIONED BY (data string)\n" - + "STORED AS %s\n" - + "LOCATION '%s'", - QUALIFIED_TABLE_NAME, fileFormat, TABLE_LOCATION_STR)); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - - df.select("id", "data").orderBy("data").write().mode("append").insertInto(QUALIFIED_TABLE_NAME); - } - - static void cleanupData() throws IOException { - // Drop the hive table. - SQLContext sc = new SQLContext(TestSparkTableUtil.spark); - sc.sql(String.format("DROP TABLE IF EXISTS %s", QUALIFIED_TABLE_NAME)); - - // Delete the data corresponding to the table. - TABLE_LOCATION_PATH.getFileSystem(HiveTableBaseTest.hiveConf).delete(TABLE_LOCATION_PATH, true); - } - - @RunWith(Parameterized.class) - public static class TableImport { - - private final FileFormat format; - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @Parameterized.Parameters(name = "format = {0}") - public static Object[] parameters() { - return new Object[] {"parquet", "orc"}; - } - - public TableImport(String format) { - this.format = FileFormat.fromString(format); - } - - @Before - public void before() { - loadData(format); - } - - @After - public void after() throws IOException { - cleanupData(); - } - - @Test - public void testImportPartitionedTable() throws Exception { - File location = temp.newFolder("partitioned_table"); - spark - .table(QUALIFIED_TABLE_NAME) - .write() - .mode("overwrite") - .partitionBy("data") - .format(format.toString()) - .saveAsTable("test_partitioned_table"); - TableIdentifier source = - spark.sessionState().sqlParser().parseTableIdentifier("test_partitioned_table"); - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Schema tableSchema = SparkSchemaUtil.schemaForTable(spark, QUALIFIED_TABLE_NAME); - Table table = - tables.create( - tableSchema, - SparkSchemaUtil.specForTable(spark, QUALIFIED_TABLE_NAME), - ImmutableMap.of(), - location.getCanonicalPath()); - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - long count = spark.read().format("iceberg").load(location.toString()).count(); - Assert.assertEquals("three values ", 3, count); - } - - @Test - public void testImportUnpartitionedTable() throws Exception { - File location = temp.newFolder("unpartitioned_table"); - spark - .table(QUALIFIED_TABLE_NAME) - .write() - .mode("overwrite") - .format(format.toString()) - .saveAsTable("test_unpartitioned_table"); - TableIdentifier source = - spark.sessionState().sqlParser().parseTableIdentifier("test_unpartitioned_table"); - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = - tables.create( - SparkSchemaUtil.schemaForTable(spark, QUALIFIED_TABLE_NAME), - SparkSchemaUtil.specForTable(spark, QUALIFIED_TABLE_NAME), - ImmutableMap.of(), - location.getCanonicalPath()); - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - long count = spark.read().format("iceberg").load(location.toString()).count(); - Assert.assertEquals("three values ", 3, count); - } - - @Test - public void testImportAsHiveTable() throws Exception { - spark - .table(QUALIFIED_TABLE_NAME) - .write() - 
.mode("overwrite") - .format(format.toString()) - .saveAsTable("unpartitioned_table"); - TableIdentifier source = new TableIdentifier("unpartitioned_table"); - org.apache.iceberg.catalog.TableIdentifier testUnpartitionedTableId = - org.apache.iceberg.catalog.TableIdentifier.of( - DB_NAME, "test_unpartitioned_table_" + format); - File stagingDir = temp.newFolder("staging-dir"); - Table table = - catalog.createTable( - testUnpartitionedTableId, - SparkSchemaUtil.schemaForTable(spark, "unpartitioned_table"), - SparkSchemaUtil.specForTable(spark, "unpartitioned_table")); - - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - long count1 = - spark.read().format("iceberg").load(testUnpartitionedTableId.toString()).count(); - Assert.assertEquals("three values ", 3, count1); - - spark - .table(QUALIFIED_TABLE_NAME) - .write() - .mode("overwrite") - .partitionBy("data") - .format(format.toString()) - .saveAsTable("partitioned_table"); - - source = new TableIdentifier("partitioned_table"); - org.apache.iceberg.catalog.TableIdentifier testPartitionedTableId = - org.apache.iceberg.catalog.TableIdentifier.of( - DB_NAME, "test_partitioned_table_" + format); - table = - catalog.createTable( - testPartitionedTableId, - SparkSchemaUtil.schemaForTable(spark, "partitioned_table"), - SparkSchemaUtil.specForTable(spark, "partitioned_table")); - - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - long count2 = spark.read().format("iceberg").load(testPartitionedTableId.toString()).count(); - Assert.assertEquals("three values ", 3, count2); - } - - @Test - public void testImportWithNameMapping() throws Exception { - spark - .table(QUALIFIED_TABLE_NAME) - .write() - .mode("overwrite") - .format(format.toString()) - .saveAsTable("original_table"); - - // The field is different so that it will project with name mapping - Schema filteredSchema = new Schema(optional(1, "data", Types.StringType.get())); - - NameMapping nameMapping = MappingUtil.create(filteredSchema); - - String targetTableName = "target_table_" + format; - TableIdentifier source = new TableIdentifier("original_table"); - org.apache.iceberg.catalog.TableIdentifier targetTable = - org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, targetTableName); - Table table = - catalog.createTable( - targetTable, filteredSchema, SparkSchemaUtil.specForTable(spark, "original_table")); - - table - .updateProperties() - .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) - .commit(); - - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - - // The filter invoke the metric/dictionary row group filter in which it project schema - // with name mapping again to match the metric read from footer. 
- List actual = - spark - .read() - .format("iceberg") - .load(targetTable.toString()) - .select("data") - .sort("data") - .filter("data >= 'b'") - .as(Encoders.STRING()) - .collectAsList(); - - List expected = Lists.newArrayList("b", "c"); - - Assert.assertEquals(expected, actual); - } - - @Test - public void testImportWithNameMappingForVectorizedParquetReader() throws Exception { - Assume.assumeTrue("Applies only to parquet format.", FileFormat.PARQUET == format); - spark - .table(QUALIFIED_TABLE_NAME) - .write() - .mode("overwrite") - .format(format.toString()) - .saveAsTable("original_table"); - - // The field is different so that it will project with name mapping - Schema filteredSchema = new Schema(optional(1, "data", Types.StringType.get())); - - NameMapping nameMapping = MappingUtil.create(filteredSchema); - - TableIdentifier source = new TableIdentifier("original_table"); - Table table = - catalog.createTable( - org.apache.iceberg.catalog.TableIdentifier.of( - DB_NAME, "target_table_for_vectorization"), - filteredSchema, - SparkSchemaUtil.specForTable(spark, "original_table")); - - table - .updateProperties() - .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) - .set(PARQUET_VECTORIZATION_ENABLED, "true") - .commit(); - - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - - // The filter invoke the metric/dictionary row group filter in which it project schema - // with name mapping again to match the metric read from footer. - List actual = - spark - .read() - .format("iceberg") - .load(DB_NAME + ".target_table_for_vectorization") - .select("data") - .sort("data") - .filter("data >= 'b'") - .as(Encoders.STRING()) - .collectAsList(); - - List expected = Lists.newArrayList("b", "c"); - - Assert.assertEquals(expected, actual); - } - - @Test - public void testImportPartitionedWithWhitespace() throws Exception { - String partitionCol = "dAtA sPaced"; - String spacedTableName = "whitespacetable"; - String whiteSpaceKey = "some key value"; - - List spacedRecords = Lists.newArrayList(new SimpleRecord(1, whiteSpaceKey)); - - File icebergLocation = temp.newFolder("partitioned_table"); - - spark - .createDataFrame(spacedRecords, SimpleRecord.class) - .withColumnRenamed("data", partitionCol) - .write() - .mode("overwrite") - .partitionBy(partitionCol) - .format(format.toString()) - .saveAsTable(spacedTableName); - - TableIdentifier source = - spark.sessionState().sqlParser().parseTableIdentifier(spacedTableName); - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = - tables.create( - SparkSchemaUtil.schemaForTable(spark, spacedTableName), - SparkSchemaUtil.specForTable(spark, spacedTableName), - ImmutableMap.of(), - icebergLocation.getCanonicalPath()); - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - List results = - spark - .read() - .format("iceberg") - .load(icebergLocation.toString()) - .withColumnRenamed(partitionCol, "data") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Data should match", spacedRecords, results); - } - - @Test - public void testImportUnpartitionedWithWhitespace() throws Exception { - String spacedTableName = "whitespacetable_" + format; - String whiteSpaceKey = "some key value"; - - List spacedRecords = Lists.newArrayList(new SimpleRecord(1, whiteSpaceKey)); - - File whiteSpaceOldLocation = 
temp.newFolder("white space location"); - File icebergLocation = temp.newFolder("partitioned_table"); - - spark - .createDataFrame(spacedRecords, SimpleRecord.class) - .write() - .mode("overwrite") - .format(format.toString()) - .save(whiteSpaceOldLocation.getPath()); - - spark - .catalog() - .createExternalTable(spacedTableName, whiteSpaceOldLocation.getPath(), format.toString()); - - TableIdentifier source = - spark.sessionState().sqlParser().parseTableIdentifier(spacedTableName); - HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = - tables.create( - SparkSchemaUtil.schemaForTable(spark, spacedTableName), - SparkSchemaUtil.specForTable(spark, spacedTableName), - ImmutableMap.of(), - icebergLocation.getCanonicalPath()); - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - List results = - spark - .read() - .format("iceberg") - .load(icebergLocation.toString()) - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Data should match", spacedRecords, results); - } - - @Test - public void testSparkPartitionKryoSerialization() throws IOException { - Map values = ImmutableMap.of("id", "2"); - String uri = "s3://bucket/table/data/id=2"; - SparkPartition sparkPartition = new SparkPartition(values, uri, format.toString()); - - SparkPartition deserialized = KryoHelpers.roundTripSerialize(sparkPartition); - Assertions.assertThat(sparkPartition).isEqualTo(deserialized); - } - - @Test - public void testSparkPartitionJavaSerialization() throws IOException, ClassNotFoundException { - Map values = ImmutableMap.of("id", "2"); - String uri = "s3://bucket/table/data/id=2"; - SparkPartition sparkPartition = new SparkPartition(values, uri, format.toString()); - - SparkPartition deserialized = TestHelpers.roundTripSerialize(sparkPartition); - Assertions.assertThat(sparkPartition).isEqualTo(deserialized); - } - } - - public static class GetPartitions { - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - // This logic does not really depend on format - private final FileFormat format = FileFormat.PARQUET; - - @Test - public void testPartitionScan() throws Exception { - - List records = - Lists.newArrayList( - new ThreeColumnRecord(1, "ab", "data"), - new ThreeColumnRecord(2, "b c", "data"), - new ThreeColumnRecord(1, "b c", "data"), - new ThreeColumnRecord(2, "ab", "data")); - - String tableName = "external_table"; - - spark - .createDataFrame(records, ThreeColumnRecord.class) - .write() - .mode("overwrite") - .format(format.toString()) - .partitionBy("c1", "c2") - .saveAsTable(tableName); - - TableIdentifier source = spark.sessionState().sqlParser().parseTableIdentifier(tableName); - - Map partition1 = - ImmutableMap.of( - "c1", "1", - "c2", "ab"); - Map partition2 = - ImmutableMap.of( - "c1", "2", - "c2", "b c"); - Map partition3 = - ImmutableMap.of( - "c1", "1", - "c2", "b c"); - Map partition4 = - ImmutableMap.of( - "c1", "2", - "c2", "ab"); - - List partitionsC11 = - SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c1", "1")); - Set> expectedC11 = Sets.newHashSet(partition1, partition3); - Set> actualC11 = - partitionsC11.stream().map(p -> p.getValues()).collect(Collectors.toSet()); - Assert.assertEquals("Wrong partitions fetched for c1=1", expectedC11, actualC11); - - List partitionsC12 = - SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c1", "2")); - Set> expectedC12 = Sets.newHashSet(partition2, partition4); - 
Set> actualC12 = - partitionsC12.stream().map(p -> p.getValues()).collect(Collectors.toSet()); - Assert.assertEquals("Wrong partitions fetched for c1=2", expectedC12, actualC12); - - List partitionsC21 = - SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c2", "ab")); - Set> expectedC21 = Sets.newHashSet(partition1, partition4); - Set> actualC21 = - partitionsC21.stream().map(p -> p.getValues()).collect(Collectors.toSet()); - Assert.assertEquals("Wrong partitions fetched for c2=ab", expectedC21, actualC21); - - List partitionsC22 = - SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c2", "b c")); - Set> expectedC22 = Sets.newHashSet(partition2, partition3); - Set> actualC22 = - partitionsC22.stream().map(p -> p.getValues()).collect(Collectors.toSet()); - Assert.assertEquals("Wrong partitions fetched for c2=b c", expectedC22, actualC22); - } - } - - public static class PartitionScan { - - @Before - public void before() { - loadData(FileFormat.PARQUET); - } - - @After - public void after() throws IOException { - cleanupData(); - } - - @Test - public void testPartitionScan() { - List partitions = SparkTableUtil.getPartitions(spark, QUALIFIED_TABLE_NAME); - Assert.assertEquals("There should be 3 partitions", 3, partitions.size()); - - Dataset partitionDF = SparkTableUtil.partitionDF(spark, QUALIFIED_TABLE_NAME); - Assert.assertEquals("There should be 3 partitions", 3, partitionDF.count()); - } - - @Test - public void testPartitionScanByFilter() { - List partitions = - SparkTableUtil.getPartitionsByFilter(spark, QUALIFIED_TABLE_NAME, "data = 'a'"); - Assert.assertEquals("There should be 1 matching partition", 1, partitions.size()); - - Dataset partitionDF = - SparkTableUtil.partitionDFByFilter(spark, QUALIFIED_TABLE_NAME, "data = 'a'"); - Assert.assertEquals("There should be 1 matching partition", 1, partitionDF.count()); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java deleted file mode 100644 index 734dddc5a75e..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java +++ /dev/null @@ -1,580 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
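The partition assertions in the removed GetPartitions test above compare partition value maps fetched through SparkTableUtil. Spelled out with element types, the pattern looks roughly like the sketch below; it assumes, as SparkTableUtil.SparkPartition suggests, that getValues() returns a Map<String, String>, and that spark, source and the partition maps are the ones defined in that test.

      // Fetch only the partitions matching c1=1 and compare their value maps as an unordered set.
      List<SparkPartition> partitionsC11 =
          SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c1", "1"));
      Set<Map<String, String>> expectedC11 = Sets.newHashSet(partition1, partition3);
      Set<Map<String, String>> actualC11 =
          partitionsC11.stream().map(SparkPartition::getValues).collect(Collectors.toSet());
      Assert.assertEquals("Wrong partitions fetched for c1=1", expectedC11, actualC11);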
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.mapping.MappingUtil; -import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkTableUtil; -import org.apache.iceberg.spark.SparkTableUtil.SparkPartition; -import org.apache.iceberg.types.Conversions; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Column; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.TableIdentifier; -import org.apache.spark.sql.functions; -import org.apache.spark.sql.internal.SQLConf; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestSparkTableUtilWithInMemoryCatalog { - - private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final PartitionSpec SPEC = - PartitionSpec.builderFor(SCHEMA).identity("data").build(); - - private static SparkSession spark; - - @BeforeClass - public static void startSpark() { - TestSparkTableUtilWithInMemoryCatalog.spark = - SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestSparkTableUtilWithInMemoryCatalog.spark; - TestSparkTableUtilWithInMemoryCatalog.spark = null; - currentSpark.stop(); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private String tableLocation = null; - - @Before - public void setupTableLocation() throws Exception { - File tableDir = temp.newFolder(); - this.tableLocation = tableDir.toURI().toString(); - } - - @Test - public void testImportUnpartitionedTable() throws IOException { - Map props = Maps.newHashMap(); - props.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); - props.put(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "data", "full"); - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), props, tableLocation); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class).coalesce(1); - inputDF - .select("id", "data") - .write() - .format("parquet") - .mode("append") - .option("path", parquetTableLocation) - .saveAsTable("parquet_table"); - - File stagingDir = temp.newFolder("staging-dir"); - 
SparkTableUtil.importSparkTable( - spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - List actualRecords = - spark - .read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Result rows should match", records, actualRecords); - - Dataset fileDF = spark.read().format("iceberg").load(tableLocation + "#files"); - Types.NestedField idField = table.schema().findField("id"); - checkFieldMetrics(fileDF, idField, true); - Types.NestedField dataField = table.schema().findField("data"); - checkFieldMetrics(fileDF, dataField, false); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testImportPartitionedTable() throws IOException { - Map props = Maps.newHashMap(); - props.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); - props.put(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "data", "full"); - Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF - .select("id", "data") - .write() - .format("parquet") - .mode("append") - .option("path", parquetTableLocation) - .partitionBy("data") - .saveAsTable("parquet_table"); - - Assert.assertEquals( - "Should have 3 partitions", - 3, - SparkTableUtil.getPartitions(spark, "parquet_table").size()); - - Assert.assertEquals( - "Should have 1 partition where data = 'a'", - 1, - SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'").size()); - - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable( - spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - List actualRecords = - spark - .read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Result rows should match", records, actualRecords); - - Dataset fileDF = spark.read().format("iceberg").load(tableLocation + "#files"); - Types.NestedField idField = table.schema().findField("id"); - checkFieldMetrics(fileDF, idField, true); - // 'data' is a partition column and is not physically present in files written by Spark - Types.NestedField dataField = table.schema().findField("data"); - checkFieldMetrics(fileDF, dataField, true); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testImportPartitions() throws IOException { - Table table = TABLES.create(SCHEMA, SPEC, tableLocation); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF - .select("id", "data") - .write() - .format("parquet") - .mode("append") - .option("path", parquetTableLocation) - .partitionBy("data") - .saveAsTable("parquet_table"); - - File stagingDir = temp.newFolder("staging-dir"); - List partitions = - SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'"); - SparkTableUtil.importSparkPartitions( - spark, partitions, 
table, table.spec(), stagingDir.toString()); - - List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - - List actualRecords = - spark - .read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Result rows should match", expectedRecords, actualRecords); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testImportPartitionsWithSnapshotInheritance() throws IOException { - Table table = TABLES.create(SCHEMA, SPEC, tableLocation); - - table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - - List records = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF - .select("id", "data") - .write() - .format("parquet") - .mode("append") - .option("path", parquetTableLocation) - .partitionBy("data") - .saveAsTable("parquet_table"); - - File stagingDir = temp.newFolder("staging-dir"); - List partitions = - SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'"); - SparkTableUtil.importSparkPartitions( - spark, partitions, table, table.spec(), stagingDir.toString()); - - List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - - List actualRecords = - spark - .read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - - Assert.assertEquals("Result rows should match", expectedRecords, actualRecords); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testImportTableWithMappingForNestedData() throws IOException { - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - Dataset df1 = - spark - .range(1, 2) - .withColumn("extra_col", functions.lit(-1)) - .withColumn( - "struct", - functions.expr( - "named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")); - Dataset df2 = - spark - .range(2, 3) - .withColumn("extra_col", functions.lit(-1)) - .withColumn( - "struct", - functions.expr( - "named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')")); - df1.union(df2) - .coalesce(1) - .select("id", "extra_col", "struct") - .write() - .format("parquet") - .mode("append") - .option("path", parquetTableLocation) - .saveAsTable("parquet_table"); - - // don't include `extra_col` and `nested_2` on purpose - Schema schema = - new Schema( - optional(1, "id", Types.LongType.get()), - required( - 2, - "struct", - Types.StructType.of( - required(3, "nested_1", Types.StringType.get()), - required(4, "nested_3", Types.StringType.get())))); - Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation); - - // assign a custom metrics config and a name mapping - NameMapping nameMapping = MappingUtil.create(schema); - table - .updateProperties() - .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts") - .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full") - .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full") - .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) - .commit(); - - File stagingDir = temp.newFolder("staging-dir"); - 
SparkTableUtil.importSparkTable( - spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - // validate we get the expected results back - List expected = - spark - .table("parquet_table") - .select("id", "struct.nested_1", "struct.nested_3") - .collectAsList(); - List actual = - spark - .read() - .format("iceberg") - .load(tableLocation) - .select("id", "struct.nested_1", "struct.nested_3") - .collectAsList(); - Assert.assertEquals("Rows must match", expected, actual); - - // validate we persisted correct metrics - Dataset fileDF = spark.read().format("iceberg").load(tableLocation + "#files"); - - List bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList(); - Assert.assertEquals( - "Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size()); - Assert.assertEquals( - "Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size()); - - Types.NestedField nestedField1 = table.schema().findField("struct.nested_1"); - checkFieldMetrics(fileDF, nestedField1, true); - - Types.NestedField id = table.schema().findField("id"); - checkFieldMetrics(fileDF, id, 1L, 2L); - - Types.NestedField nestedField3 = table.schema().findField("struct.nested_3"); - checkFieldMetrics(fileDF, nestedField3, "f", "g"); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOException { - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - Dataset df1 = - spark - .range(1, 2) - .withColumn("extra_col", functions.lit(-1)) - .withColumn( - "struct", - functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")) - .withColumn("data", functions.lit("Z")); - Dataset df2 = - spark - .range(2, 3) - .withColumn("extra_col", functions.lit(-1)) - .withColumn( - "struct", - functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')")) - .withColumn("data", functions.lit("Z")); - df1.union(df2) - .coalesce(1) - .select("id", "extra_col", "struct", "data") - .write() - .format("parquet") - .mode("append") - .option("path", parquetTableLocation) - .partitionBy("data") - .saveAsTable("parquet_table"); - - // don't include `extra_col` and `nested_2` on purpose - Schema schema = - new Schema( - optional(1, "id", Types.LongType.get()), - required( - 2, - "struct", - Types.StructType.of( - required(4, "nested_1", Types.StringType.get()), - required(5, "nested_3", Types.StringType.get()))), - required(3, "data", Types.StringType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); - Table table = TABLES.create(schema, spec, tableLocation); - - // assign a custom metrics config and a name mapping - NameMapping nameMapping = MappingUtil.create(schema); - table - .updateProperties() - .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts") - .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full") - .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full") - .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) - .commit(); - - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable( - spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - // validate we get the expected results back - List expected = - spark - .table("parquet_table") - .select("id", "struct.nested_1", "struct.nested_3", "data") - 
.collectAsList(); - List actual = - spark - .read() - .format("iceberg") - .load(tableLocation) - .select("id", "struct.nested_1", "struct.nested_3", "data") - .collectAsList(); - Assert.assertEquals("Rows must match", expected, actual); - - // validate we persisted correct metrics - Dataset fileDF = spark.read().format("iceberg").load(tableLocation + "#files"); - - List bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList(); - Assert.assertEquals( - "Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size()); - Assert.assertEquals( - "Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size()); - - Types.NestedField nestedField1 = table.schema().findField("struct.nested_1"); - checkFieldMetrics(fileDF, nestedField1, true); - - Types.NestedField id = table.schema().findField("id"); - checkFieldMetrics(fileDF, id, 1L, 2L); - - Types.NestedField nestedField3 = table.schema().findField("struct.nested_3"); - checkFieldMetrics(fileDF, nestedField3, "f", "g"); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - @Test - public void testImportTableWithInt96Timestamp() throws IOException { - File parquetTableDir = temp.newFolder("parquet_table"); - String parquetTableLocation = parquetTableDir.toURI().toString(); - - try { - spark.conf().set(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE().key(), "INT96"); - - Column timestampColumn = functions.to_timestamp(functions.lit("2010-03-20 10:40:30.1234")); - Dataset df = spark.range(1, 10).withColumn("tmp_col", timestampColumn); - df.coalesce(1) - .select("id", "tmp_col") - .write() - .format("parquet") - .mode("append") - .option("path", parquetTableLocation) - .saveAsTable("parquet_table"); - - Schema schema = - new Schema( - optional(1, "id", Types.LongType.get()), - optional(2, "tmp_col", Types.TimestampType.withZone())); - Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation); - - // assign a custom metrics config and disable vectorized reads - table - .updateProperties() - .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") - .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false") - .commit(); - - File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable( - spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - // validate we get the expected results back - List expected = spark.table("parquet_table").select("id", "tmp_col").collectAsList(); - List actual = - spark - .read() - .format("iceberg") - .load(tableLocation) - .select("id", "tmp_col") - .collectAsList(); - Assert.assertEquals("Rows must match", expected, actual); - - // validate we did not persist metrics for INT96 - Dataset fileDF = spark.read().format("iceberg").load(tableLocation + "#files"); - - Types.NestedField timestampField = table.schema().findField("tmp_col"); - checkFieldMetrics(fileDF, timestampField, true); - - Types.NestedField idField = table.schema().findField("id"); - checkFieldMetrics(fileDF, idField, 1L, 9L); - } finally { - spark.sql("DROP TABLE parquet_table"); - } - } - - private void checkFieldMetrics( - Dataset fileDF, Types.NestedField field, Object min, Object max) { - List metricRows = - fileDF - .selectExpr( - String.format("lower_bounds['%d']", field.fieldId()), - String.format("upper_bounds['%d']", field.fieldId())) - .collectAsList(); - - // we compare string representations not to deal with HeapCharBuffers for strings - Object actualMin = - Conversions.fromByteBuffer(field.type(), 
ByteBuffer.wrap(metricRows.get(0).getAs(0))); - Assert.assertEquals("Min value should match", min.toString(), actualMin.toString()); - Object actualMax = - Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(metricRows.get(0).getAs(1))); - Assert.assertEquals("Max value should match", max.toString(), actualMax.toString()); - } - - private void checkFieldMetrics(Dataset fileDF, Types.NestedField field, boolean isNull) { - List metricRows = - fileDF - .selectExpr( - String.format("lower_bounds['%d']", field.fieldId()), - String.format("upper_bounds['%d']", field.fieldId())) - .collectAsList(); - - metricRows.forEach( - row -> { - Assert.assertEquals( - "Invalid metrics for column: " + field.name(), isNull, row.isNullAt(0)); - Assert.assertEquals( - "Invalid metrics for column: " + field.name(), isNull, row.isNullAt(1)); - }); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java deleted file mode 100644 index 06ecc20c2fc3..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
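The checkFieldMetrics helpers in the removed class above read the lower_bounds and upper_bounds maps from the #files metadata table and decode one entry per field id. A typed sketch of that decoding step, under the same assumptions as the helper (fileDF is the Dataset loaded from tableLocation + "#files" and field is a Types.NestedField from the table schema):

    List<Row> metricRows =
        fileDF
            .selectExpr(
                String.format("lower_bounds['%d']", field.fieldId()),
                String.format("upper_bounds['%d']", field.fieldId()))
            .collectAsList();

    // Bounds are stored as serialized single values; decode them with the field's Iceberg type.
    byte[] minBytes = metricRows.get(0).getAs(0);
    byte[] maxBytes = metricRows.get(0).getAs(1);
    Object actualMin = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(minBytes));
    Object actualMax = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(maxBytes));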
- */ -package org.apache.iceberg.spark.source; - -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Table; -import org.apache.iceberg.io.FileWriterFactory; -import org.apache.iceberg.io.TestWriterMetrics; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.unsafe.types.UTF8String; - -public class TestSparkWriterMetrics extends TestWriterMetrics { - - public TestSparkWriterMetrics(FileFormat fileFormat) { - super(fileFormat); - } - - @Override - protected FileWriterFactory newWriterFactory(Table sourceTable) { - return SparkFileWriterFactory.builderFor(sourceTable) - .dataSchema(sourceTable.schema()) - .dataFileFormat(fileFormat) - .deleteFileFormat(fileFormat) - .positionDeleteRowSchema(sourceTable.schema()) - .build(); - } - - @Override - protected InternalRow toRow(Integer id, String data, boolean boolValue, Long longValue) { - InternalRow row = new GenericInternalRow(3); - row.update(0, id); - row.update(1, UTF8String.fromString(data)); - - InternalRow nested = new GenericInternalRow(2); - nested.update(0, boolValue); - nested.update(1, longValue); - - row.update(2, nested); - return row; - } - - @Override - protected InternalRow toGenericRow(int value, int repeated) { - InternalRow row = new GenericInternalRow(repeated); - for (int i = 0; i < repeated; i++) { - row.update(i, value); - } - return row; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java deleted file mode 100644 index a350bc3a44b8..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import com.fasterxml.jackson.databind.node.ObjectNode; -import java.util.Arrays; -import org.apache.iceberg.util.JsonUtil; -import org.junit.Assert; -import org.junit.Test; - -public class TestStreamingOffset { - - @Test - public void testJsonConversion() { - org.apache.iceberg.spark.source.StreamingOffset[] expected = - new org.apache.iceberg.spark.source.StreamingOffset[] { - new org.apache.iceberg.spark.source.StreamingOffset( - System.currentTimeMillis(), 1L, false), - new org.apache.iceberg.spark.source.StreamingOffset( - System.currentTimeMillis(), 2L, false), - new org.apache.iceberg.spark.source.StreamingOffset( - System.currentTimeMillis(), 3L, false), - new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 4L, true) - }; - Assert.assertArrayEquals( - "StreamingOffsets should match", - expected, - Arrays.stream(expected) - .map(elem -> org.apache.iceberg.spark.source.StreamingOffset.fromJson(elem.json())) - .toArray()); - } - - @Test - public void testToJson() throws Exception { - org.apache.iceberg.spark.source.StreamingOffset expected = - new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 1L, false); - ObjectNode actual = JsonUtil.mapper().createObjectNode(); - actual.put("version", 1); - actual.put("snapshot_id", expected.snapshotId()); - actual.put("position", 1L); - actual.put("scan_all_files", false); - String expectedJson = expected.json(); - String actualJson = JsonUtil.mapper().writeValueAsString(actual); - Assert.assertEquals("Json should match", expectedJson, actualJson); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java deleted file mode 100644 index 7493a87b87c1..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.types.Types.NestedField.optional; - -import java.io.File; -import java.util.List; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.execution.streaming.MemoryStream; -import org.apache.spark.sql.streaming.DataStreamWriter; -import org.apache.spark.sql.streaming.StreamingQuery; -import org.apache.spark.sql.streaming.StreamingQueryException; -import org.assertj.core.api.Assertions; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import scala.collection.JavaConversions; - -public class TestStructuredStreaming { - - private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static SparkSession spark = null; - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - @BeforeClass - public static void startSpark() { - TestStructuredStreaming.spark = - SparkSession.builder() - .master("local[2]") - .config("spark.sql.shuffle.partitions", 4) - .getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestStructuredStreaming.spark; - TestStructuredStreaming.spark = null; - currentSpark.stop(); - } - - @Test - public void testStreamingWriteAppendMode() throws Exception { - File parent = temp.newFolder("parquet"); - File location = new File(parent, "test-table"); - File checkpoint = new File(parent, "checkpoint"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3"), - new SimpleRecord(4, "4")); - - MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = - inputStream - .toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); - - try { - // start the original query with checkpointing - StreamingQuery query = streamWriter.start(); - List batch1 = Lists.newArrayList(1, 2); - send(batch1, inputStream); - query.processAllAvailable(); - List batch2 = Lists.newArrayList(3, 4); - send(batch2, inputStream); - query.processAllAvailable(); - query.stop(); - - // remove the last commit to force Spark to reprocess batch #1 - File lastCommitFile = new File(checkpoint.toString() + "/commits/1"); - Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete()); - - // restart the query from the checkpoint 
- StreamingQuery restartedQuery = streamWriter.start(); - restartedQuery.processAllAvailable(); - - // ensure the write was idempotent - Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); - } finally { - for (StreamingQuery query : spark.streams().active()) { - query.stop(); - } - } - } - - @Test - public void testStreamingWriteCompleteMode() throws Exception { - File parent = temp.newFolder("parquet"); - File location = new File(parent, "test-table"); - File checkpoint = new File(parent, "checkpoint"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(2, "1"), new SimpleRecord(3, "2"), new SimpleRecord(1, "3")); - - MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = - inputStream - .toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); - - try { - // start the original query with checkpointing - StreamingQuery query = streamWriter.start(); - List batch1 = Lists.newArrayList(1, 2); - send(batch1, inputStream); - query.processAllAvailable(); - List batch2 = Lists.newArrayList(1, 2, 2, 3); - send(batch2, inputStream); - query.processAllAvailable(); - query.stop(); - - // remove the last commit to force Spark to reprocess batch #1 - File lastCommitFile = new File(checkpoint.toString() + "/commits/1"); - Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete()); - - // restart the query from the checkpoint - StreamingQuery restartedQuery = streamWriter.start(); - restartedQuery.processAllAvailable(); - - // ensure the write was idempotent - Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = - result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); - } finally { - for (StreamingQuery query : spark.streams().active()) { - query.stop(); - } - } - } - - @Test - public void testStreamingWriteCompleteModeWithProjection() throws Exception { - File parent = temp.newFolder("parquet"); - File location = new File(parent, "test-table"); - File checkpoint = new File(parent, "checkpoint"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = tables.create(SCHEMA, spec, location.toString()); - - List expected = - Lists.newArrayList( - new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); - - MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = - inputStream - .toDF() - 
.groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id") // select only id column - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); - - try { - // start the original query with checkpointing - StreamingQuery query = streamWriter.start(); - List batch1 = Lists.newArrayList(1, 2); - send(batch1, inputStream); - query.processAllAvailable(); - List batch2 = Lists.newArrayList(1, 2, 2, 3); - send(batch2, inputStream); - query.processAllAvailable(); - query.stop(); - - // remove the last commit to force Spark to reprocess batch #1 - File lastCommitFile = new File(checkpoint.toString() + "/commits/1"); - Assert.assertTrue("The commit file must be deleted", lastCommitFile.delete()); - - // restart the query from the checkpoint - StreamingQuery restartedQuery = streamWriter.start(); - restartedQuery.processAllAvailable(); - - // ensure the write was idempotent - Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = - result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); - Assert.assertEquals("Result rows should match", expected, actual); - Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); - } finally { - for (StreamingQuery query : spark.streams().active()) { - query.stop(); - } - } - } - - @Test - public void testStreamingWriteUpdateMode() throws Exception { - File parent = temp.newFolder("parquet"); - File location = new File(parent, "test-table"); - File checkpoint = new File(parent, "checkpoint"); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); - tables.create(SCHEMA, spec, location.toString()); - - MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = - inputStream - .toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("update") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); - - try { - StreamingQuery query = streamWriter.start(); - List batch1 = Lists.newArrayList(1, 2); - send(batch1, inputStream); - Assertions.assertThatThrownBy(query::processAllAvailable) - .isInstanceOf(StreamingQueryException.class) - .hasMessageContaining("Output mode Update is not supported"); - } finally { - for (StreamingQuery query : spark.streams().active()) { - query.stop(); - } - } - } - - private MemoryStream newMemoryStream(int id, SQLContext sqlContext, Encoder encoder) { - return new MemoryStream<>(id, sqlContext, encoder); - } - - private void send(List records, MemoryStream stream) { - stream.addData(JavaConversions.asScalaBuffer(records)); - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java deleted file mode 100644 index ef2f73c3803c..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
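The structured streaming tests removed above push data through a MemoryStream source via the two helpers at the bottom of the class. With their type parameters written out (a sketch; the tests only ever instantiate them with Integer elements), they read roughly as follows.

  // Wrap Spark's MemoryStream so the tests can feed Java lists into a streaming query.
  private <T> MemoryStream<T> newMemoryStream(int id, SQLContext sqlContext, Encoder<T> encoder) {
    return new MemoryStream<>(id, sqlContext, encoder);
  }

  private <T> void send(List<T> records, MemoryStream<T> stream) {
    // MemoryStream.addData expects a Scala collection, so convert the Java list first.
    stream.addData(JavaConversions.asScalaBuffer(records));
  }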
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import java.io.File; -import java.util.Map; -import org.apache.iceberg.BaseTable; -import org.apache.iceberg.Files; -import org.apache.iceberg.LocationProviders; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.TableOperations; -import org.apache.iceberg.exceptions.AlreadyExistsException; -import org.apache.iceberg.exceptions.CommitFailedException; -import org.apache.iceberg.exceptions.RuntimeIOException; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.io.LocationProvider; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -// TODO: Use the copy of this from core. -class TestTables { - private TestTables() {} - - static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { - TestTableOperations ops = new TestTableOperations(name); - if (ops.current() != null) { - throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); - } - ops.commit( - null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); - return new TestTable(ops, name); - } - - static TestTable load(String name) { - TestTableOperations ops = new TestTableOperations(name); - if (ops.current() == null) { - return null; - } - return new TestTable(ops, name); - } - - static boolean drop(String name) { - synchronized (METADATA) { - return METADATA.remove(name) != null; - } - } - - static class TestTable extends BaseTable { - private final TestTableOperations ops; - - private TestTable(TestTableOperations ops, String name) { - super(ops, name); - this.ops = ops; - } - - @Override - public TestTableOperations operations() { - return ops; - } - } - - private static final Map METADATA = Maps.newHashMap(); - - static void clearTables() { - synchronized (METADATA) { - METADATA.clear(); - } - } - - static TableMetadata readMetadata(String tableName) { - synchronized (METADATA) { - return METADATA.get(tableName); - } - } - - static void replaceMetadata(String tableName, TableMetadata metadata) { - synchronized (METADATA) { - METADATA.put(tableName, metadata); - } - } - - static class TestTableOperations implements TableOperations { - - private final String tableName; - private TableMetadata current = null; - private long lastSnapshotId = 0; - private int failCommits = 0; - - TestTableOperations(String tableName) { - this.tableName = tableName; - refresh(); - if (current != null) { - for (Snapshot snap : current.snapshots()) { - this.lastSnapshotId = Math.max(lastSnapshotId, snap.snapshotId()); - } - } else { - this.lastSnapshotId 
= 0; - } - } - - void failCommits(int numFailures) { - this.failCommits = numFailures; - } - - @Override - public TableMetadata current() { - return current; - } - - @Override - public TableMetadata refresh() { - synchronized (METADATA) { - this.current = METADATA.get(tableName); - } - return current; - } - - @Override - public void commit(TableMetadata base, TableMetadata metadata) { - if (base != current) { - throw new CommitFailedException("Cannot commit changes based on stale metadata"); - } - synchronized (METADATA) { - refresh(); - if (base == current) { - if (failCommits > 0) { - this.failCommits -= 1; - throw new CommitFailedException("Injected failure"); - } - METADATA.put(tableName, metadata); - this.current = metadata; - } else { - throw new CommitFailedException( - "Commit failed: table was updated at %d", base.lastUpdatedMillis()); - } - } - } - - @Override - public FileIO io() { - return new LocalFileIO(); - } - - @Override - public LocationProvider locationProvider() { - Preconditions.checkNotNull( - current, "Current metadata should not be null when locatinProvider is called"); - return LocationProviders.locationsFor(current.location(), current.properties()); - } - - @Override - public String metadataFileLocation(String fileName) { - return new File(new File(current.location(), "metadata"), fileName).getAbsolutePath(); - } - - @Override - public long newSnapshotId() { - long nextSnapshotId = lastSnapshotId + 1; - this.lastSnapshotId = nextSnapshotId; - return nextSnapshotId; - } - } - - static class LocalFileIO implements FileIO { - - @Override - public InputFile newInputFile(String path) { - return Files.localInput(path); - } - - @Override - public OutputFile newOutputFile(String path) { - return Files.localOutput(new File(path)); - } - - @Override - public void deleteFile(String path) { - if (!new File(path).delete()) { - throw new RuntimeIOException("Failed to delete file: " + path); - } - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java deleted file mode 100644 index 053f6dbaea46..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java +++ /dev/null @@ -1,309 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.Files.localOutput; - -import java.io.File; -import java.io.IOException; -import java.time.LocalDateTime; -import java.util.List; -import java.util.UUID; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericAppenderFactory; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.spark.SparkReadOptions; -import org.apache.iceberg.spark.SparkSQLProperties; -import org.apache.iceberg.spark.SparkTestBase; -import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.spark.SparkWriteOptions; -import org.apache.iceberg.spark.data.GenericsHelpers; -import org.apache.iceberg.types.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestTimestampWithoutZone extends SparkTestBase { - private static final Configuration CONF = new Configuration(); - private static final HadoopTables TABLES = new HadoopTables(CONF); - - private static final Schema SCHEMA = - new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(3, "data", Types.StringType.get())); - - private static SparkSession spark = null; - - @BeforeClass - public static void startSpark() { - TestTimestampWithoutZone.spark = SparkSession.builder().master("local[2]").getOrCreate(); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestTimestampWithoutZone.spark; - TestTimestampWithoutZone.spark = null; - currentSpark.stop(); - } - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private final String format; - private final boolean vectorized; - - @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") - public static Object[][] parameters() { - return new Object[][] { - {"parquet", false}, - {"parquet", true}, - {"avro", false} - }; - } - - public TestTimestampWithoutZone(String format, boolean vectorized) { - this.format = format; - this.vectorized = vectorized; - } - - private File parent = null; - private File unpartitioned = null; - private List records = null; - - @Before - public void writeUnpartitionedTable() throws IOException { - this.parent = temp.newFolder("TestTimestampWithoutZone"); - this.unpartitioned = new File(parent, "unpartitioned"); - File dataFolder = new File(unpartitioned, "data"); - Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs()); - - Table table = 
TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString()); - Schema tableSchema = table.schema(); // use the table schema because ids are reassigned - - FileFormat fileFormat = FileFormat.fromString(format); - - File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString())); - - // create records using the table's schema - this.records = testRecords(tableSchema); - - try (FileAppender writer = - new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { - writer.addAll(records); - } - - DataFile file = - DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); - - table.newAppend().appendFile(file).commit(); - } - - @Test - public void testUnpartitionedTimestampWithoutZone() { - assertEqualsSafe(SCHEMA.asStruct(), records, read(unpartitioned.toString(), vectorized)); - } - - @Test - public void testUnpartitionedTimestampWithoutZoneProjection() { - Schema projection = SCHEMA.select("id", "ts"); - assertEqualsSafe( - projection.asStruct(), - records.stream().map(r -> projectFlat(projection, r)).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized, "id", "ts")); - } - - @Test - public void testUnpartitionedTimestampWithoutZoneError() { - AssertHelpers.assertThrows( - String.format( - "Read operation performed on a timestamp without timezone field while " - + "'%s' set to false should throw exception", - SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), - IllegalArgumentException.class, - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, - () -> - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .load(unpartitioned.toString()) - .collectAsList()); - } - - @Test - public void testUnpartitionedTimestampWithoutZoneAppend() { - spark - .read() - .format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - assertEqualsSafe( - SCHEMA.asStruct(), - Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized)); - } - - @Test - public void testUnpartitionedTimestampWithoutZoneWriteError() { - String errorMessage = - String.format( - "Write operation performed on a timestamp without timezone field while " - + "'%s' set to false should throw exception", - SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); - Runnable writeOperation = - () -> - spark - .read() - .format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - AssertHelpers.assertThrows( - errorMessage, - IllegalArgumentException.class, - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, - writeOperation); - } - - @Test - public void testUnpartitionedTimestampWithoutZoneSessionProperties() { - withSQLConf( - 
ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), - () -> { - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - assertEqualsSafe( - SCHEMA.asStruct(), - Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized)); - }); - } - - private static Record projectFlat(Schema projection, Record record) { - Record result = GenericRecord.create(projection); - List fields = projection.asStruct().fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - result.set(i, record.getField(field.name())); - } - return result; - } - - public static void assertEqualsSafe( - Types.StructType struct, List expected, List actual) { - Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); - for (int i = 0; i < expected.size(); i += 1) { - GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i)); - } - } - - private List testRecords(Schema schema) { - return Lists.newArrayList( - record(schema, 0L, parseToLocal("2017-12-22T09:20:44.294658"), "junction"), - record(schema, 1L, parseToLocal("2017-12-22T07:15:34.582910"), "alligator"), - record(schema, 2L, parseToLocal("2017-12-22T06:02:09.243857"), "forrest"), - record(schema, 3L, parseToLocal("2017-12-22T03:10:11.134509"), "clapping"), - record(schema, 4L, parseToLocal("2017-12-22T00:34:00.184671"), "brush"), - record(schema, 5L, parseToLocal("2017-12-21T22:20:08.935889"), "trap"), - record(schema, 6L, parseToLocal("2017-12-21T21:55:30.589712"), "element"), - record(schema, 7L, parseToLocal("2017-12-21T17:31:14.532797"), "limited"), - record(schema, 8L, parseToLocal("2017-12-21T15:21:51.237521"), "global"), - record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish")); - } - - private static List read(String table, boolean vectorized) { - return read(table, vectorized, "*"); - } - - private static List read( - String table, boolean vectorized, String select0, String... selectN) { - Dataset dataset = - spark - .read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .load(table) - .select(select0, selectN); - return dataset.collectAsList(); - } - - private static LocalDateTime parseToLocal(String timestamp) { - return LocalDateTime.parse(timestamp); - } - - private static Record record(Schema schema, Object... values) { - Record rec = GenericRecord.create(schema); - for (int i = 0; i < values.length; i += 1) { - rec.set(i, values[i]); - } - return rec; - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java deleted file mode 100644 index 9bf00f1b1365..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.spark.source; - -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.Map; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.hadoop.HadoopTables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.spark.SparkWriteOptions; -import org.apache.iceberg.spark.data.RandomData; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.ByteBuffers; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.InternalRow; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; - -public class TestWriteMetricsConfig { - - private static final Configuration CONF = new Configuration(); - private static final Schema SIMPLE_SCHEMA = - new Schema( - optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema COMPLEX_SCHEMA = - new Schema( - required(1, "longCol", Types.IntegerType.get()), - optional(2, "strCol", Types.StringType.get()), - required( - 3, - "record", - Types.StructType.of( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get())))); - - @Rule public TemporaryFolder temp = new TemporaryFolder(); - - private static SparkSession spark = null; - private static JavaSparkContext sc = null; - - @BeforeClass - public static void startSpark() { - TestWriteMetricsConfig.spark = SparkSession.builder().master("local[2]").getOrCreate(); - TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } - - @AfterClass - public static void stopSpark() { - SparkSession currentSpark = TestWriteMetricsConfig.spark; - TestWriteMetricsConfig.spark = null; - TestWriteMetricsConfig.sc = null; - currentSpark.stop(); - } - - @Test - public void testFullMetricsCollectionForParquet() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - 
PartitionSpec spec = PartitionSpec.unpartitioned(); - Map properties = Maps.newHashMap(); - properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full"); - Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - - List expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data") - .coalesce(1) - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, "parquet") - .mode(SaveMode.Append) - .save(tableLocation); - - for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) { - DataFile file = task.file(); - Assert.assertEquals(2, file.nullValueCounts().size()); - Assert.assertEquals(2, file.valueCounts().size()); - Assert.assertEquals(2, file.lowerBounds().size()); - Assert.assertEquals(2, file.upperBounds().size()); - } - } - - @Test - public void testCountMetricsCollectionForParquet() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map properties = Maps.newHashMap(); - properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); - Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - - List expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data") - .coalesce(1) - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, "parquet") - .mode(SaveMode.Append) - .save(tableLocation); - - for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) { - DataFile file = task.file(); - Assert.assertEquals(2, file.nullValueCounts().size()); - Assert.assertEquals(2, file.valueCounts().size()); - Assert.assertTrue(file.lowerBounds().isEmpty()); - Assert.assertTrue(file.upperBounds().isEmpty()); - } - } - - @Test - public void testNoMetricsCollectionForParquet() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map properties = Maps.newHashMap(); - properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); - Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - - List expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data") - .coalesce(1) - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, "parquet") - .mode(SaveMode.Append) - .save(tableLocation); - - for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) { - DataFile file = task.file(); - Assert.assertTrue(file.nullValueCounts().isEmpty()); - Assert.assertTrue(file.valueCounts().isEmpty()); - Assert.assertTrue(file.lowerBounds().isEmpty()); - Assert.assertTrue(file.upperBounds().isEmpty()); - } - } - - @Test - public void testCustomMetricCollectionForParquet() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map properties = 
Maps.newHashMap(); - properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); - properties.put("write.metadata.metrics.column.id", "full"); - Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - - List<SimpleRecord> expectedRecords = - Lists.newArrayList( - new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data") - .coalesce(1) - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, "parquet") - .mode(SaveMode.Append) - .save(tableLocation); - - Schema schema = table.schema(); - Types.NestedField id = schema.findField("id"); - for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) { - DataFile file = task.file(); - Assert.assertEquals(2, file.nullValueCounts().size()); - Assert.assertEquals(2, file.valueCounts().size()); - Assert.assertEquals(1, file.lowerBounds().size()); - Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId())); - Assert.assertEquals(1, file.upperBounds().size()); - Assert.assertTrue(file.upperBounds().containsKey(id.fieldId())); - } - } - - @Test - public void testBadCustomMetricCollectionForParquet() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.unpartitioned(); - Map<String, String> properties = Maps.newHashMap(); - properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); - properties.put("write.metadata.metrics.column.ids", "full"); - - AssertHelpers.assertThrows( - "Creating a table with invalid metrics should fail", - ValidationException.class, - null, - () -> tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation)); - } - - @Test - public void testCustomMetricCollectionForNestedParquet() throws IOException { - String tableLocation = temp.newFolder("iceberg-table").toString(); - - HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA).identity("strCol").build(); - Map<String, String> properties = Maps.newHashMap(); - properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); - properties.put("write.metadata.metrics.column.longCol", "counts"); - properties.put("write.metadata.metrics.column.record.id", "full"); - properties.put("write.metadata.metrics.column.record.data", "truncate(2)"); - Table table = tables.create(COMPLEX_SCHEMA, spec, properties, tableLocation); - - Iterable<InternalRow> rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0); - JavaRDD<InternalRow> rdd = sc.parallelize(Lists.newArrayList(rows)); - Dataset<Row> df = - spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); - - df.coalesce(1) - .write() - .format("iceberg") - .option(SparkWriteOptions.WRITE_FORMAT, "parquet") - .mode(SaveMode.Append) - .save(tableLocation); - - Schema schema = table.schema(); - Types.NestedField longCol = schema.findField("longCol"); - Types.NestedField recordId = schema.findField("record.id"); - Types.NestedField recordData = schema.findField("record.data"); - for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) { - DataFile file = task.file(); - - Map<Integer, Long> nullValueCounts = file.nullValueCounts(); - Assert.assertEquals(3, nullValueCounts.size()); - Assert.assertTrue(nullValueCounts.containsKey(longCol.fieldId())); - Assert.assertTrue(nullValueCounts.containsKey(recordId.fieldId())); - Assert.assertTrue(nullValueCounts.containsKey(recordData.fieldId())); - - Map<Integer, Long> valueCounts =
file.valueCounts(); - Assert.assertEquals(3, valueCounts.size()); - Assert.assertTrue(valueCounts.containsKey(longCol.fieldId())); - Assert.assertTrue(valueCounts.containsKey(recordId.fieldId())); - Assert.assertTrue(valueCounts.containsKey(recordData.fieldId())); - - Map<Integer, ByteBuffer> lowerBounds = file.lowerBounds(); - Assert.assertEquals(2, lowerBounds.size()); - Assert.assertTrue(lowerBounds.containsKey(recordId.fieldId())); - ByteBuffer recordDataLowerBound = lowerBounds.get(recordData.fieldId()); - Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataLowerBound).length); - - Map<Integer, ByteBuffer> upperBounds = file.upperBounds(); - Assert.assertEquals(2, upperBounds.size()); - Assert.assertTrue(upperBounds.containsKey(recordId.fieldId())); - ByteBuffer recordDataUpperBound = upperBounds.get(recordData.fieldId()); - Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataUpperBound).length); - } - } -} diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java deleted file mode 100644 index 554557df416c..000000000000 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License.
- */ -package org.apache.iceberg.spark.source; - -import java.util.Objects; - -public class ThreeColumnRecord { - private Integer c1; - private String c2; - private String c3; - - public ThreeColumnRecord() {} - - public ThreeColumnRecord(Integer c1, String c2, String c3) { - this.c1 = c1; - this.c2 = c2; - this.c3 = c3; - } - - public Integer getC1() { - return c1; - } - - public void setC1(Integer c1) { - this.c1 = c1; - } - - public String getC2() { - return c2; - } - - public void setC2(String c2) { - this.c2 = c2; - } - - public String getC3() { - return c3; - } - - public void setC3(String c3) { - this.c3 = c3; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - ThreeColumnRecord that = (ThreeColumnRecord) o; - return Objects.equals(c1, that.c1) - && Objects.equals(c2, that.c2) - && Objects.equals(c3, that.c3); - } - - @Override - public int hashCode() { - return Objects.hash(c1, c2, c3); - } - - @Override - public String toString() { - return "ThreeColumnRecord{" + "c1=" + c1 + ", c2='" + c2 + '\'' + ", c3='" + c3 + '\'' + '}'; - } -} diff --git a/spark/v2.4/spark/src/test/resources/data/books.json b/spark/v2.4/spark/src/test/resources/data/books.json deleted file mode 100644 index 902b4e316f01..000000000000 --- a/spark/v2.4/spark/src/test/resources/data/books.json +++ /dev/null @@ -1,6 +0,0 @@ -{"title":"Gone", "price":12, "author": "Michael Grant", "published": 1541776051, "genre": "fiction"} -{"title":"Carry On", "price":10, "author": "Rainbow Rowell", "published": 1536505651, "genre": "fiction"} -{"title":"Warward Son", "price":12, "author": "Rainbow Rowell", "published": 1504969651, "genre": "fiction"} -{"title":"Heroes", "price":8, "author": "Stephen Fry", "published": 1504969651, "genre": "fiction"} -{"title":"Vietnam", "price":15, "author": "Max Hastings", "genre": "non-fiction"} - diff --git a/spark/v2.4/spark/src/test/resources/data/new-books.json b/spark/v2.4/spark/src/test/resources/data/new-books.json deleted file mode 100644 index 3418151c8fb2..000000000000 --- a/spark/v2.4/spark/src/test/resources/data/new-books.json +++ /dev/null @@ -1,4 +0,0 @@ -{"title":"Harry Potter", "price":12, "author": "JK Rowling", "published": 1570719361, "genre": "fiction", "publisher": "ACME Books"} -{"title":"Percy Jackson", "price":10, "author": "Rick Riordan", "published": 1547132161, "genre": "fiction", "publisher": "ACME Books"} -{"title":"Cookie", "price":8, "author": "Jacqueline Wilson", "published": 1552229761, "genre": "fiction", "publisher": "ACME Books"} -{"title":"Fangirl", "price":12, "author": "Rainbow Rowell", "published": 1552229761, "genre": "fiction", "publisher": "ACME Books"}