DRILL-4028: Update Drill to leverage the latest version of the Parquet library.
- Remove references to the shaded version of the Jackson @JsonCreator annotation that leaked from parquet; replace them with the proper fasterxml version (see the example after this list).
- Fix imports that used the wrong parquet packages after the rebase.
- Fix issues with the Drill parquet read and write paths after merging the Drill parquet fork back into mainline.
- Fix the writer: the RecordConsumer needed to be flushed in the ParquetRecordWriter (a sketch follows this list).
- Consolidate the page reading code.
- Add a test helper that prints additional context when an ordered comparison of two datasets fails in a test.
- Fix up parquet API usage in the Hive module.
- Add a unit test that reads and writes all types in parquet; the decimal types and interval-year type still have some issues.
- Use the direct codec factory from its new package in the parquet library, now that it has been moved.
- Move the test for the direct codec factory out of the Drill source, as the class itself has been moved.
- Small fix after consolidating two different ByteBuffer-based implementations of BytesInput.
- Small fixes to accommodate interface changes.
- Small changes to remove direct references to DirectCodecFactory; the class is no longer accessible outside of parquet, but an instance with the same contract is now available through a new factory method on CodecFactory.
- Fix a failing test that used miniDFS to read a larger parquet file.
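For illustration, the annotation swap looks like this; ExampleSubScanSpec is a made-up stand-in for the HBase and Mongo spec classes changed below:

    // Before: import parquet.org.codehaus.jackson.annotate.JsonCreator;  (shaded copy)
    // After:
    import com.fasterxml.jackson.annotation.JsonCreator;
    import com.fasterxml.jackson.annotation.JsonProperty;

    public class ExampleSubScanSpec {
      private final String tableName;

      @JsonCreator  // Jackson selects this constructor when deserializing the physical plan
      public ExampleSubScanSpec(@JsonProperty("tableName") String tableName) {
        this.tableName = tableName;
      }
    }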
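The ParquetRecordWriter diff itself is not visible on this page, so here is a minimal sketch of the writer fix, assuming the RecordConsumer obtained from parquet's MessageColumnIO exposes a flush() method; the class and method names are illustrative:

    import org.apache.parquet.io.api.RecordConsumer;

    class WriterFlushSketch {
      private final RecordConsumer consumer;  // assumed: obtained from MessageColumnIO

      WriterFlushSketch(RecordConsumer consumer) {
        this.consumer = consumer;
      }

      // Without the flush, values buffered inside the RecordConsumer never
      // reach the underlying column writers, so the tail of the last batch
      // is silently dropped from the written file.
      void endBatch() {
        consumer.flush();  // assumption: flush() is available on the consumer
      }
    }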

This closes #236
jaltekruse authored and jacques-n committed Nov 5, 2015
1 parent 18a1ae4 commit 39582bd
Showing 55 changed files with 519 additions and 1,126 deletions.
@@ -33,6 +33,7 @@
 import java.util.TreeMap;
 import java.util.concurrent.TimeUnit;
 
+import com.fasterxml.jackson.annotation.JsonCreator;
 import org.apache.drill.common.exceptions.DrillRuntimeException;
 import org.apache.drill.common.exceptions.ExecutionSetupException;
 import org.apache.drill.common.expression.SchemaPath;
@@ -52,8 +53,6 @@
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.client.HTable;
 
-import parquet.org.codehaus.jackson.annotate.JsonCreator;
-
 import com.fasterxml.jackson.annotation.JacksonInject;
 import com.fasterxml.jackson.annotation.JsonIgnore;
 import com.fasterxml.jackson.annotation.JsonProperty;
@@ -122,7 +122,7 @@ public static class HBaseSubScanSpec {
   protected byte[] stopRow;
   protected byte[] serializedFilter;
 
-  @parquet.org.codehaus.jackson.annotate.JsonCreator
+  @JsonCreator
   public HBaseSubScanSpec(@JsonProperty("tableName") String tableName,
                           @JsonProperty("regionServer") String regionServer,
                           @JsonProperty("startRow") byte[] startRow,
@@ -37,7 +37,7 @@
 import org.apache.drill.exec.record.RecordBatch;
 import org.apache.drill.exec.store.AbstractRecordReader;
 import org.apache.drill.exec.store.RecordReader;
-import org.apache.drill.exec.store.parquet.DirectCodecFactory;
+import org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator;
 import org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader;
 import org.apache.drill.exec.util.ImpersonationUtil;
 import org.apache.hadoop.conf.Configuration;
@@ -50,9 +50,10 @@
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
-import parquet.hadoop.ParquetFileReader;
-import parquet.hadoop.metadata.BlockMetaData;
-import parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.CodecFactory;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 
 @SuppressWarnings("unused")
 public class HiveDrillNativeScanBatchCreator implements BatchCreator<HiveDrillNativeParquetSubScan> {
@@ -125,7 +126,8 @@ public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan
             context,
             Path.getPathWithoutSchemeAndAuthority(finalPath).toString(),
             rowGroupNum, fs,
-            new DirectCodecFactory(fs.getConf(), oContext.getAllocator()),
+            CodecFactory.createDirectCodecFactory(fs.getConf(),
+                new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0),
             parquetMetadata,
             newColumns)
         );
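For context, CodecFactory.createDirectCodecFactory is parquet's public replacement for Drill's removed DirectCodecFactory; a minimal usage sketch (the helper name and parameter wiring here are illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.parquet.bytes.ByteBufferAllocator;
    import org.apache.parquet.hadoop.CodecFactory;

    class CodecFactorySketch {
      static CodecFactory create(Configuration conf, ByteBufferAllocator allocator) {
        // The third argument is a page-size hint; the Drill call above passes 0.
        return CodecFactory.createDirectCodecFactory(conf, allocator, 0);
      }
    }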
@@ -31,6 +31,11 @@
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 
+import com.fasterxml.jackson.annotation.JacksonInject;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.drill.common.exceptions.DrillRuntimeException;
 import org.apache.drill.common.exceptions.ExecutionSetupException;
@@ -53,12 +58,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import parquet.org.codehaus.jackson.annotate.JsonCreator;
-
-import com.fasterxml.jackson.annotation.JacksonInject;
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.fasterxml.jackson.annotation.JsonProperty;
-import com.fasterxml.jackson.annotation.JsonTypeName;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Joiner;
 import com.google.common.base.Preconditions;
@@ -132,7 +132,7 @@ public static class MongoSubScanSpec {
 
   protected BasicDBObject filter;
 
-  @parquet.org.codehaus.jackson.annotate.JsonCreator
+  @JsonCreator
   public MongoSubScanSpec(@JsonProperty("dbName") String dbName,
                           @JsonProperty("collectionName") String collectionName,
                           @JsonProperty("hosts") List<String> hosts,
16 changes: 8 additions & 8 deletions exec/java-exec/pom.xml
@@ -159,7 +159,7 @@
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-column</artifactId>
       <version>${parquet.version}</version>
       <exclusions>
@@ -174,7 +174,7 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-hadoop</artifactId>
       <version>${parquet.version}</version>
       <exclusions>
@@ -189,9 +189,9 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-format</artifactId>
-      <version>2.1.1-drill-r1</version>
+      <version>2.3.0-incubating</version>
       <exclusions>
         <exclusion>
           <groupId>org.apache.hadoop</groupId>
@@ -204,7 +204,7 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-common</artifactId>
       <version>${parquet.version}</version>
       <exclusions>
@@ -219,7 +219,7 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-jackson</artifactId>
       <version>${parquet.version}</version>
       <exclusions>
@@ -234,7 +234,7 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-encoding</artifactId>
       <version>${parquet.version}</version>
       <exclusions>
@@ -249,7 +249,7 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>com.twitter</groupId>
+      <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-generator</artifactId>
       <version>${parquet.version}</version>
       <exclusions>
@@ -22,36 +22,13 @@
 
 package org.apache.drill.exec.store;
 
-import com.fasterxml.jackson.core.JsonGenerator;
 import com.google.common.collect.Lists;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.exec.expr.TypeHelper;
 import org.apache.drill.exec.expr.holders.*;
 import org.apache.drill.exec.record.BatchSchema;
 import org.apache.drill.exec.store.EventBasedRecordWriter.FieldConverter;
-import org.apache.drill.exec.store.parquet.ParquetTypeHelper;
 import org.apache.drill.exec.vector.*;
-import org.apache.drill.exec.util.DecimalUtility;
-import org.apache.drill.exec.vector.complex.reader.FieldReader;
-import parquet.io.api.RecordConsumer;
-import parquet.schema.MessageType;
-import parquet.io.api.Binary;
-import io.netty.buffer.ByteBuf;
-import org.apache.drill.exec.record.BatchSchema;
-import org.apache.drill.exec.record.MaterializedField;
-import org.apache.drill.exec.vector.complex.fn.JsonOutput;
-
-
-
-import org.apache.drill.common.types.TypeProtos;
-
-import org.joda.time.DateTimeUtils;
-
-import java.io.IOException;
-import java.lang.UnsupportedOperationException;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
 
 /**
  * Abstract implementation of RecordWriter interface which exposes interface:
@@ -17,7 +17,7 @@
  */
 
 import org.joda.time.DateTimeUtils;
-import parquet.io.api.Binary;
+import org.apache.parquet.io.api.Binary;
 
 import java.lang.Override;
 import java.lang.RuntimeException;
@@ -39,9 +39,9 @@
 import org.apache.drill.exec.vector.*;
 import org.apache.drill.exec.util.DecimalUtility;
 import org.apache.drill.exec.vector.complex.reader.FieldReader;
-import parquet.io.api.RecordConsumer;
-import parquet.schema.MessageType;
-import parquet.io.api.Binary;
+import org.apache.parquet.io.api.RecordConsumer;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.io.api.Binary;
 import io.netty.buffer.DrillBuf;
 import org.apache.drill.exec.record.BatchSchema;
 import org.apache.drill.exec.record.MaterializedField;
16 changes: 8 additions & 8 deletions exec/java-exec/src/main/codegen/templates/ParquetTypeHelper.java
@@ -17,10 +17,10 @@
  */
 
 import org.apache.drill.common.types.MinorType;
-import parquet.format.ConvertedType;
-import parquet.schema.DecimalMetadata;
-import parquet.schema.OriginalType;
-import parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.format.ConvertedType;
+import org.apache.parquet.schema.DecimalMetadata;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
 
 <@pp.dropOutputFile />
 <@pp.changeOutputFile name="org/apache/drill/exec/store/parquet/ParquetTypeHelper.java" />
@@ -31,10 +31,10 @@
 import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.exec.record.MaterializedField;
-import parquet.schema.OriginalType;
-import parquet.schema.DecimalMetadata;
-import parquet.schema.PrimitiveType.PrimitiveTypeName;
-import parquet.schema.Type.Repetition;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.DecimalMetadata;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Type.Repetition;
 
 import java.util.HashMap;
 import java.util.Map;
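The template above generates the Drill-to-parquet type mapping; the following hand-written sketch shows a small, non-exhaustive subset of what the generated ParquetTypeHelper encodes:

    import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;

    final class TypeMappingSketch {
      // Illustrative examples only; the generated helper covers every Drill minor type.
      static PrimitiveTypeName primitiveFor(String drillMinorType) {
        switch (drillMinorType) {
          case "INT":     return PrimitiveTypeName.INT32;
          case "BIGINT":  return PrimitiveTypeName.INT64;
          case "FLOAT4":  return PrimitiveTypeName.FLOAT;
          case "FLOAT8":  return PrimitiveTypeName.DOUBLE;
          case "BIT":     return PrimitiveTypeName.BOOLEAN;
          case "VARCHAR": return PrimitiveTypeName.BINARY;  // annotated with OriginalType.UTF8
          default:
            throw new UnsupportedOperationException("unmapped type: " + drillMinorType);
        }
      }
    }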
@@ -78,7 +78,7 @@
 import org.apache.drill.exec.vector.ValueVector;
 import org.apache.calcite.rel.RelFieldCollation.Direction;
 
-import parquet.Preconditions;
+import com.google.common.base.Preconditions;
 
 import com.google.common.collect.Lists;
 import com.sun.codemodel.JConditional;
@@ -46,7 +46,7 @@
 
 import com.fasterxml.jackson.annotation.JsonCreator;
 import com.fasterxml.jackson.annotation.JsonProperty;
-import parquet.Preconditions;
+import com.google.common.base.Preconditions;
 
 @XmlRootElement
 public class QueryWrapper {
@@ -25,10 +25,10 @@
 
 import org.apache.hadoop.fs.FSDataInputStream;
 
-import parquet.bytes.BytesInput;
-import parquet.format.PageHeader;
-import parquet.format.Util;
-import parquet.hadoop.util.CompatibilityUtil;
+import org.apache.parquet.bytes.BytesInput;
+import org.apache.parquet.format.PageHeader;
+import org.apache.parquet.format.Util;
+import org.apache.parquet.hadoop.util.CompatibilityUtil;
 
 public class ColumnDataReader {
   static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ColumnDataReader.class);
@@ -46,6 +46,10 @@ public PageHeader readPageHeader() throws IOException{
     return Util.readPageHeader(input);
   }
 
+  public FSDataInputStream getInputStream() {
+    return input;
+  }
+
   public BytesInput getPageAsBytesInput(int pageLength) throws IOException{
     byte[] b = new byte[pageLength];
     input.read(b);
@@ -55,8 +59,9 @@ public BytesInput getPageAsBytesInput(int pageLength) throws IOException{
   public void loadPage(DrillBuf target, int pageLength) throws IOException {
     target.clear();
     ByteBuffer directBuffer = target.nioBuffer(0, pageLength);
-    while (directBuffer.remaining() > 0) {
-      CompatibilityUtil.getBuf(input, directBuffer, directBuffer.remaining());
+    int lengthLeftToRead = pageLength;
+    while (lengthLeftToRead > 0) {
+      lengthLeftToRead -= CompatibilityUtil.getBuf(input, directBuffer, lengthLeftToRead);
     }
     target.writerIndex(pageLength);
  }
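The loadPage change above is the fix for the failing miniDFS test from the commit message: progress is now tracked by the byte count CompatibilityUtil.getBuf returns, rather than by the buffer's remaining() after each call. A minimal sketch of the same pattern in isolation (readFully is an illustrative name):

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.parquet.hadoop.util.CompatibilityUtil;

    final class ReadFullySketch {
      // Reads exactly `length` bytes into `buf`, tolerating short reads.
      static void readFully(FSDataInputStream in, ByteBuffer buf, int length)
          throws IOException {
        int left = length;
        while (left > 0) {
          left -= CompatibilityUtil.getBuf(in, buf, left);  // returns bytes actually read
        }
      }
    }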