Skip to content

Commit

Permalink
[FLINK-10365] [s3] Factor out Hadoop FS classes into pre-shaded artif…
Browse files Browse the repository at this point in the history
…act and update to Hadoop 3
  • Loading branch information
StephanEwen committed Sep 19, 2018
1 parent 3292dc5 commit b0e5642
Show file tree
Hide file tree
Showing 9 changed files with 9,591 additions and 1 deletion.
32 changes: 32 additions & 0 deletions flink-filesystems/flink-fs-hadoop-shaded/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
This project bundles the minimal dependencies from Hadoop's
FileSystem abstraction and shades them to avoid dependency conflicts.

This project is the basis for the bundled File System adapters
that are based on Hadoop code, but keep the appearance of Flink
being Hadoop-free, from a dependency perspective.

For this to work, however, we needed to adapt Hadoop's `Configuration`
class to load a (shaded) `core-default-shaded.xml` configuration with the
relocated class names of classes loaded via reflection.

# Changing the Hadoop Version

If you want to change the Hadoop version this project depends on, the following
steps are required to keep the shading correct:

1. From the respective Hadoop jar (currently 3.1.0):
   - copy `org/apache/hadoop/conf/Configuration.java` to `src/main/java/org/apache/hadoop/conf/` and
     - replace `core-default.xml` with `core-default-shaded.xml`.
   - copy `org/apache/hadoop/util/NativeCodeLoader.java` to `src/main/java/org/apache/hadoop/util/` and
     - replace the native method stubs as in the current setup (empty methods, or return false).
   - copy `core-default.xml` to `src/main/resources/core-default-shaded.xml` and
     - change every occurrence of `org.apache.hadoop` into `org.apache.flink.fs.shaded.hadoop3.org.apache.hadoop`.
   - copy `core-site.xml` to `src/test/resources/core-site.xml` (as is).

2. Verify that the shaded jar:
   - does not contain any unshaded classes
     - all other classes should be under `org.apache.flink.fs.shaded.hadoop3`
   - contains a `META-INF/services/org.apache.flink.core.fs.FileSystemFactory` file pointing to two classes: `org.apache.flink.fs.s3hadoop.S3FileSystemFactory` and `org.apache.flink.fs.s3hadoop.S3AFileSystemFactory`
     - other service files under `META-INF/services` should have their names and contents in the relocated `org.apache.flink.fs.s3hadoop.shaded` package
   - contains a `core-default-shaded.xml` file
   - does not contain a `core-default.xml` or `core-site.xml` file
262 changes: 262 additions & 0 deletions flink-filesystems/flink-fs-hadoop-shaded/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

<modelVersion>4.0.0</modelVersion>

<!-- Parent POM: the flink-filesystems aggregator module, resolved from the directory one level up. -->
<parent>
<groupId>org.apache.flink</groupId>
<artifactId>flink-filesystems</artifactId>
<version>1.7-SNAPSHOT</version>
<relativePath>..</relativePath>
</parent>

<artifactId>flink-fs-hadoop-shaded</artifactId>
<name>flink-filesystems :: flink-fs-hadoop-shaded</name>

<packaging>jar</packaging>

<!-- Single dependency: hadoop-common, with a long list of its transitive dependencies excluded. -->
<!-- The version comes from the fs.hadoopshaded.version property, which is not defined in this file. -->
<!-- NOTE(review): the exclusions presumably cover everything not needed by the FileSystem code paths; -->
<!-- confirm against the shaded jar contents before adding or removing an entry. -->
<dependencies>
<!-- The Hadoop file system abstraction -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${fs.hadoopshaded.version}</version>
<exclusions>
<!-- JDK tooling and SSH client -->
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
<exclusion>
<groupId>com.jcraft</groupId>
<artifactId>jsch</artifactId>
</exclusion>
<!-- Jersey artifacts -->
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-servlet</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-json</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-server</artifactId>
</exclusion>
<!-- Avro -->
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
<!-- log4j and its slf4j binding -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<!-- Jetty and servlet / JSP APIs -->
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-webapp</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>jsp-api</artifactId>
</exclusion>
<!-- Kerby, Curator and ZooKeeper -->
<exclusion>
<groupId>org.apache.kerby</groupId>
<artifactId>kerb-simplekdc</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-framework</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<!-- commons-net / commons-cli / commons-codec -->
<exclusion>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
</exclusion>
<exclusion>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
</exclusion>
<exclusion>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</exclusion>
<!-- protobuf, gson and httpclient -->
<exclusion>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</exclusion>
<!-- Apache commons-compress / commons-math3 -->
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</exclusion>
<!-- nimbus-jose-jwt and json-smart -->
<exclusion>
<groupId>com.nimbusds</groupId>
<artifactId>nimbus-jose-jwt</artifactId>
</exclusion>
<exclusion>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>

<!-- Build configuration: skip deployment, additionally package a test-jar, and assemble the shaded jar. -->
<build>

<!-- this is merely an intermediate build artifact and should not be -->
<!-- deployed to maven central -->
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>

<!-- publish the core-site.xml for tests -->
<!-- (done by packaging the test resources into an additional test-jar artifact) -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>

<!-- bundle all dependencies into the jar and relocate selected packages to hide them -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<id>shade-flink</id>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<!-- the test-jar built by the maven-jar-plugin above is left unshaded -->
<shadeTestJar>false</shadeTestJar>
<!-- every dependency artifact is included in the shaded jar -->
<artifactSet>
<includes>
<include>*:*</include>
</includes>
</artifactSet>
<!-- each listed package is moved under the org.apache.flink.fs.shaded.hadoop3 prefix -->
<relocations>
<!-- we shade only the parts that are internal to Hadoop and not used / exposed downstream -->
<relocation>
<pattern>com.google.re2j</pattern>
<shadedPattern>org.apache.flink.fs.shaded.hadoop3.com.google.re2j</shadedPattern>
</relocation>
<relocation>
<pattern>org.apache.htrace</pattern>
<shadedPattern>org.apache.flink.fs.shaded.hadoop3.org.apache.htrace</shadedPattern>
</relocation>
<relocation>
<pattern>com.fasterxml</pattern>
<shadedPattern>org.apache.flink.fs.shaded.hadoop3.com.fasterxml</shadedPattern>
</relocation>
<relocation>
<pattern>org.codehaus</pattern>
<shadedPattern>org.apache.flink.fs.shaded.hadoop3.org.codehaus</shadedPattern>
</relocation>
<relocation>
<pattern>com.ctc</pattern>
<shadedPattern>org.apache.flink.fs.shaded.hadoop3.com.ctc</shadedPattern>
</relocation>
</relocations>
<filters>
<!-- remove the classes from Hadoop that we replace with our overwritten implementation -->
<!-- also drops Hadoop's core-default.xml and its version-info / application-classloader property files -->
<!-- NOTE(review): the module's README describes core-default-shaded.xml as the replacement for core-default.xml -->
<filter>
<artifact>org.apache.hadoop:hadoop-common</artifact>
<excludes>
<exclude>org/apache/hadoop/conf/Configuration**</exclude>
<exclude>org/apache/hadoop/util/NativeCodeLoader**</exclude>
<exclude>org/apache/hadoop/util/VersionInfo**</exclude>
<exclude>core-default.xml</exclude>
<exclude>common-version-info.properties</exclude>
<exclude>org.apache.hadoop.application-classloader.properties</exclude>
</excludes>
</filter>
<!-- applied to all bundled artifacts: strip DTD files, Maven metadata and javax.xml.stream service files -->
<filter>
<artifact>*</artifact>
<excludes>
<exclude>properties.dtd</exclude>
<exclude>PropertyList-1.0.dtd</exclude>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/services/javax.xml.stream.*</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
Loading

0 comments on commit b0e5642

Please sign in to comment.