Skip to content

Commit

Permalink
Merge pull request apache#1141 from abti/a2o
Browse files Browse the repository at this point in the history
Renamed partitionDir.prefixLocationHint to source.dataPathIdentifier to be more consistent with naming across Hive data conversion
  • Loading branch information
abti authored Jul 25, 2016
2 parents a0eea51 + ad9b629 commit 8329710
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ public Iterable<QueryBasedHiveConversionEntity> convertRecord(Schema outputAvroS
// .. daily_2016-01-01-00 and hourly_2016-01-01-00
// This helps existing hourly data from not being deleted at the time of roll up, and so Hive queries in flight
// .. do not fail
List<String> partitionDirPrefixHint = getConversionConfig().getPartitionDirPrefixHint();
List<String> partitionDirPrefixHint = getConversionConfig().getSourceDataPathIdentifier();

// Populate optional partition info
Map<String, String> partitionsDDLInfo = Maps.newHashMap();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,43 @@ public static class ConversionConfig {
private static final String HIVE_RUNTIME_PROPERTIES_KEY_PREFIX = "hiveRuntime";

/***
* Comma separated list of string that should be used as prefix for destination table partition location dir name
* .. if present in the location path string of source destination
* This helps with rollup situations where hourly partitions roll up to 0th hour for daily but we do not want to
* .. to overwrite data so that the queries in flight do not fail; instead the Hive metadata is updated to new
* .. directory location
* Comma separated list of string that should be used as a prefix for destination partition directory name
* ... (if present in the location path string of source partition)
*
* This is helpful in roll-up / compaction scenarios, where you don't want queries in flight to fail.
*
* Scenario without this property:
* - Source partition: datepartition=2016-01-01-00 with path /foo/bar/hourly/2016/01/01/00 is available for
* processing
* - Source partition is processed and published to destination table as: /foo/bar_orc/datepartition=2016-01-01-00
*
* - Source partition: datepartition=2016-01-01-00 with path /foo/bar/daily/2016/01/01/00 is available again for
* processing (due to roll-up / compaction of hourly data for 2016-01-01 into same partition)
* - Source partition is processed and published to destination table as: /foo/bar_orc/datepartition=2016-01-01-00
* (previous data is overwritten and any queries in flight fail)
*
* Same scenario with this property set to "hourly,daily":
* - Source partition: datepartition=2016-01-01-00 with path /foo/bar/hourly/2016/01/01/00 is available for
* processing
* - Source partition is processed and published to destination table as: /foo/bar_orc/hourly_datepartition=2016-01-01-00
* (Note: "hourly_" is prefixed to destination partition directory name because source partition path contains
* "hourly" substring)
*
* - Source partition: datepartition=2016-01-01-00 with path /foo/bar/daily/2016/01/01/00 is available again for
* processing (due to roll-up / compaction of hourly data for 2016-01-01 into same partition)
* - Source partition is processed and published to destination table as: /foo/bar_orc/daily_datepartition=2016-01-01-00
* (Note: "daily_" is prefixed to destination partition directory name, because source partition path contains
* "daily" substring)
* - Any running queries are not impacted since data is not overwritten and hourly_datepartition=2016-01-01-00
* directory continues to exist
*
* Notes:
* - This however leaves the responsibility of cleanup of previous destination partition directory on retention or
* other such independent module, since in the above case hourly_datepartition=2016-01-01-00 dir will not be deleted
* - Directories can still be overwritten if they resolve to same destination partition directory name, such as
* re-processing / backfill of daily partition will overwrite daily_datepartition=2016-01-01-00 directory
*/
private static final String PARTITION_DIR_PREFIX_LOCATION_HINT = "partitionDir.prefixLocationHint";
private static final String SOURCE_DATA_PATH_IDENTIFIER_KEY = "source.dataPathIdentifier";

private final String destinationFormat;
private final String destinationTableName;
Expand All @@ -159,8 +189,7 @@ public static class ConversionConfig {
private final Properties hiveRuntimeProperties;
private final boolean evolutionEnabled;
private final Optional<Integer> rowLimit;
private final Optional<String> hiveVersion;
private final List<String> partitionDirPrefixHint;
private final List<String> sourceDataPathIdentifier;

private ConversionConfig(Config config, Table table, String destinationFormat) {

Expand All @@ -183,8 +212,7 @@ private ConversionConfig(Config config, Table table, String destinationFormat) {
.configToProperties(ConfigUtils.getConfig(config, HIVE_RUNTIME_PROPERTIES_KEY_PREFIX, ConfigFactory.empty()));
this.evolutionEnabled = ConfigUtils.getBoolean(config, EVOLUTION_ENABLED, false);
this.rowLimit = Optional.fromNullable(ConfigUtils.getInt(config, ROW_LIMIT_KEY, null));
this.hiveVersion = Optional.fromNullable(ConfigUtils.getString(config, HIVE_VERSION_KEY, null));
this.partitionDirPrefixHint = ConfigUtils.getStringList(config, PARTITION_DIR_PREFIX_LOCATION_HINT);
this.sourceDataPathIdentifier = ConfigUtils.getStringList(config, SOURCE_DATA_PATH_IDENTIFIER_KEY);
}
}

Expand Down

0 comments on commit 8329710

Please sign in to comment.