Skip to content

Commit

Permalink
Fail query when the symlink file contains non-existent paths
Browse files Browse the repository at this point in the history
  • Loading branch information
findinpath authored and findepi committed Oct 12, 2023
1 parent 331d482 commit c7e96a6
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -525,12 +525,15 @@ Iterator<InternalHiveSplit> buildManifestFileIterator(InternalHiveSplitFactory s
checkPartitionLocationExists(trinoFileSystem, location);
}
fileStatusIterator.forEachRemaining(status -> fileStatuses.put(Location.of(status.getPath()).path(), status));

List<TrinoFileStatus> locatedFileStatuses = paths.stream()
.map(path -> fileStatuses.get(path.path()))
.toList();

return createInternalHiveSplitIterator(splitFactory, splittable, Optional.empty(), locatedFileStatuses.stream());
Stream<TrinoFileStatus> fileStream = paths.stream()
.map(path -> {
TrinoFileStatus status = fileStatuses.get(path.path());
if (status == null) {
throw new TrinoException(HIVE_FILE_NOT_FOUND, "Manifest file from the location [%s] contains non-existent path: %s".formatted(location, path));
}
return status;
});
return createInternalHiveSplitIterator(splitFactory, splittable, Optional.empty(), fileStream);
}

private ListenableFuture<Void> getTransactionalSplits(Location path, boolean splittable, Optional<BucketConversion> bucketConversion, InternalHiveSplitFactory splitFactory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import static io.trino.tests.product.utils.QueryExecutors.onTrino;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

public class TestParquetSymlinkInputFormat
{
Expand Down Expand Up @@ -66,6 +67,33 @@ public void testSymlinkTable()
hdfsClient.delete(dataDir);
}

@Test(groups = STORAGE_FORMATS)
public void testSymlinkTableWithSymlinkFileContainingNonExistentPath()
throws Exception
{
String table = "test_parquet_invalid_symlink";
onHive().executeQuery("DROP TABLE IF EXISTS " + table);

onHive().executeQuery("" +
"CREATE TABLE " + table +
"(col int) " +
"ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' " +
"STORED AS " +
"INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' " +
"OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'");

String tableRoot = warehouseDirectory + '/' + table;
String dataDir = warehouseDirectory + "/data_test_parquet_invalid_symlink";

saveResourceOnHdfs("data.parquet", dataDir + "/data.parquet");
hdfsClient.saveFile(tableRoot + "/symlink.txt", format("hdfs:%s/data.parquet\nhdfs:%s/missingfile.parquet", dataDir, dataDir));
assertThatThrownBy(() -> onTrino().executeQuery("SELECT * FROM " + table))
.hasMessageMatching(".*Manifest file from the location \\[.*data_test_parquet_invalid_symlink\\] contains non-existent path:.*missingfile.parquet");

onHive().executeQuery("DROP TABLE " + table);
hdfsClient.delete(dataDir);
}

@Test(groups = {AVRO, STORAGE_FORMATS})
public void testSymlinkTableWithMultipleParentDirectories()
throws Exception
Expand Down

0 comments on commit c7e96a6

Please sign in to comment.