PARQUET-105: use mvn shade plugin to create uber jar, support meta on…

… a folder 1. Make the hadoop dependency of parquet-tools provided, so it can be used against different versions of hadoop 2. Use the maven shade plugin to create an all-in-one jar, which can be used both locally and in hadoop 3. Make the parquet-meta command support both a folder (reading the summary file) and a single file Author: Tianshuo Deng <[email protected]> Closes apache#69 from tsdeng/bundle_parquet_tools and squashes the following commits: d8dcd3e [Tianshuo Deng] print file offset, file path, and cancel autoCrop a2d1399 [Tianshuo Deng] support local mode 5009a85 [Tianshuo Deng] fix README 0756f81 [Tianshuo Deng] remove semver check for parquet_tools 78c7f4b [Tianshuo Deng] use mvn shade plugin to create uber jar, support meta on a folder
JunchengMa · Oct 21, 2014 · 31fb4df · 31fb4df
1 parent be1222e
commit 31fb4df
Show file tree

Hide file tree

Showing 6 changed files with 104 additions and 41 deletions.
diff --git a/parquet-tools/README.md b/parquet-tools/README.md
@@ -6,40 +6,76 @@ in the inspection of [Parquet files](https://github.com/Parquet).
 
 Currently these tools are available for UN*X systems.
 
-## Usage
+## Build
+
+If you want to use parquet-tools in local mode, you should use the local profile so the 
+hadoop client dependency is included.
+
+```sh
+cd parquet-tools && mvn clean package -Plocal 
+```
+
+To use it in hadoop mode, build with the default profile, which excludes the hadoop client dependency:
+
+```sh
+cd parquet-tools && mvn clean package 
+```
+
+The resulting jar is target/parquet-tools-<VERSION>.jar; you can copy it to wherever you
+want to use it.
+
+## Run from hadoop
+
+See Commands Usage for command to use
+
+```sh
+hadoop jar ./parquet-tools-<VERSION>.jar <command> my_parquet_file.lzo.parquet
+```
+
+## Run locally
+
+See Commands Usage for command to use
+
+```
+java -jar ./parquet-tools-<VERSION>.jar <command> my_parquet_file.lzo.parquet
+```
+
+## Commands Usage
+
+To run it on hadoop, you should use "hadoop jar" instead of "java -jar"
 
 ```sh
-usage: parquet-tools cat [option...] <input>
+usage: java -jar ./parquet-tools-<VERSION>.jar cat [option...] <input>
 where option is one of:
        --debug     Disable color output even if supported
     -h,--help      Show this help string
        --no-color  Disable color output even if supported
 where <input> is the parquet file to print to stdout
 
-usage: parquet-tools head [option...] <input>
+usage: java -jar ./parquet-tools-<VERSION>.jar head [option...] <input>
 where option is one of:
        --debug          Disable color output even if supported
     -h,--help           Show this help string
     -n,--records <arg>  The number of records to show (default: 5)
        --no-color       Disable color output even if supported
 where <input> is the parquet file to print to stdout
 
-usage: parquet-tools schema [option...] <input>
+usage: java -jar ./parquet-tools-<VERSION>.jar schema [option...] <input>
 where option is one of:
     -d,--detailed <arg>  Show detailed information about the schema.
        --debug           Disable color output even if supported
     -h,--help            Show this help string
        --no-color        Disable color output even if supported
 where <input> is the parquet file containing the schema to show
 
-usage: parquet-tools meta [option...] <input>
+usage: java -jar ./parquet-tools-<VERSION>.jar meta [option...] <input>
 where option is one of:
        --debug     Disable color output even if supported
     -h,--help      Show this help string
        --no-color  Disable color output even if supported
 where <input> is the parquet file to print to stdout
 
-usage: parquet-tools dump [option...] <input>
+usage: java -jar ./parquet-tools-<VERSION>.jar dump [option...] <input>
 where option is one of:
     -c,--column <arg>  Dump only the given column, can be specified more than
                        once

diff --git a/parquet-tools/pom.xml b/parquet-tools/pom.xml
@@ -15,8 +15,18 @@
   <url>https://github.com/Parquet/parquet-mr</url>
 
   <properties>
+      <hadoop.scope>provided</hadoop.scope>
   </properties>
 
+  <profiles>
+    <profile>
+      <id>local</id>
+      <properties>
+        <hadoop.scope>compile</hadoop.scope>
+      </properties>
+    </profile>
+  </profiles>
+
   <dependencies>
     <dependency>
       <groupId>com.twitter</groupId>
@@ -32,6 +42,7 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
+      <scope>${hadoop.scope}</scope>
     </dependency>
     <dependency>
       <groupId>commons-cli</groupId>
@@ -47,28 +58,40 @@
 
   <build>
     <plugins>
+      <!-- Semver checking is not enabled for parquet-tools, since it is not considered an API. -->
       <plugin>
-        <artifactId>maven-enforcer-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
         <configuration>
-          <descriptors>
-            <descriptor>src/main/assembly/assembly.xml</descriptor>
-          </descriptors>
+          <archive>
+            <manifest>
+              <mainClass>parquet.tools.Main</mainClass>
+            </manifest>
+          </archive>
         </configuration>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
         <executions>
           <execution>
-            <id>make-assembly</id>
             <phase>package</phase>
             <goals>
-              <goal>single</goal>
+              <goal>shade</goal>
             </goals>
+            <configuration>
+              <minimizeJar>false</minimizeJar>
+              <artifactSet>
+                <includes>
+                  <include>*</include>
+                </includes>
+              </artifactSet>
+            </configuration>
           </execution>
         </executions>
       </plugin>
     </plugins>
   </build>
 
-
 </project>
diff --git a/parquet-tools/src/main/java/parquet/tools/Main.java b/parquet-tools/src/main/java/parquet/tools/Main.java
@@ -165,21 +165,20 @@ public static void main(String[] args) {
     Main.out = System.out;
     Main.err = System.err;
 
-    System.setOut(new PrintStream(new OutputStream() {
-      @Override public void write(int b) throws IOException { }
-      @Override public void write(byte[] b) throws IOException { }
-      @Override public void write(byte[] b, int off, int len) throws IOException { }
-      @Override public void flush() throws IOException { }
-      @Override public void close() throws IOException { }
-    }));
-
-    System.setErr(new PrintStream(new OutputStream() {
-      @Override public void write(int b) throws IOException { }
-      @Override public void write(byte[] b) throws IOException { }
-      @Override public void write(byte[] b, int off, int len) throws IOException { }
-      @Override public void flush() throws IOException { }
-      @Override public void close() throws IOException { }
-    }));
+    PrintStream VoidStream = new PrintStream(new OutputStream() {
+      @Override
+      public void write(int b) throws IOException {}
+      @Override
+      public void write(byte[] b) throws IOException {}
+      @Override
+      public void write(byte[] b, int off, int len) throws IOException {}
+      @Override
+      public void flush() throws IOException {}
+      @Override
+      public void close() throws IOException {}
+    });
+    System.setOut(VoidStream);
+    System.setErr(VoidStream);
 
     if (args.length == 0) {
       die("No command specified", true, null, null);

diff --git a/parquet-tools/src/main/java/parquet/tools/command/ShowMetaCommand.java b/parquet-tools/src/main/java/parquet/tools/command/ShowMetaCommand.java
@@ -15,21 +15,23 @@
  */
 package parquet.tools.command;
 
-import java.io.PrintWriter;
+import static parquet.format.converter.ParquetMetadataConverter.NO_FILTER;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 
+import parquet.hadoop.Footer;
 import parquet.hadoop.ParquetFileReader;
 import parquet.hadoop.metadata.ParquetMetadata;
 import parquet.tools.util.MetadataUtils;
 import parquet.tools.util.PrettyPrintWriter;
 import parquet.tools.util.PrettyPrintWriter.WhiteSpaceHandler;
 
+import java.util.List;
+
 public class ShowMetaCommand extends ArgsOnlyCommand {
-  public static final String TABS = "    ";
-  public static final int BLOCK_BUFFER_SIZE = 64 * 1024;
   public static final String[] USAGE = new String[] {
     "<input>",
     "where <input> is the parquet file to print to stdout"
@@ -52,16 +54,20 @@ public void execute(CommandLine options) throws Exception {
     String input = args[0];
 
     Configuration conf = new Configuration();
-    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));
+    Path inputPath = new Path(input);
+    FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
+    List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);
 
     PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                              .withAutoColumn()
-                                             .withAutoCrop()
                                              .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
                                              .withColumnPadding(1)
                                              .build();
 
-    MetadataUtils.showDetails(out, metaData);
-    out.flushColumns();
+    for(Footer f: footers) {
+      out.format("file: %s%n" , f.getFile());
+      MetadataUtils.showDetails(out, f.getParquetMetadata());
+      out.flushColumns();
+    }
   }
 }
diff --git a/parquet-tools/src/main/java/parquet/tools/command/ShowSchemaCommand.java b/parquet-tools/src/main/java/parquet/tools/command/ShowSchemaCommand.java
@@ -32,8 +32,6 @@
 import parquet.tools.util.PrettyPrintWriter;
 
 public class ShowSchemaCommand extends ArgsOnlyCommand {
-  public static final DecimalFormat FRACTIONAL = new DecimalFormat("#,##0.##");
-  public static final DecimalFormat WHOLE = new DecimalFormat("#,##0");
   public static final String[] USAGE = new String[] {
     "<input>",
     "where <input> is the parquet file containing the schema to show"

diff --git a/parquet-tools/src/main/java/parquet/tools/util/MetadataUtils.java b/parquet-tools/src/main/java/parquet/tools/util/MetadataUtils.java
@@ -79,8 +79,9 @@ public static void showDetails(PrettyPrintWriter out, BlockMetaData meta) {
   private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
     long rows = meta.getRowCount();
     long tbs = meta.getTotalByteSize();
+    long offset = meta.getStartingPos();
 
-    out.format("row group%s: RC:%d TS:%d%n", (num == null ? "" : " " + num), rows, tbs);
+    out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
     out.rule('-');
     showDetails(out, meta.getColumns());
   }