HIVE-9298 : Support reading alternate timestamp formats (Jason Dere via Ashutosh Chauhan)

git-svn-id: https://svn.apache.org/repos/asf/hive/trunk@1656519 13f79535-47bb-0310-9956-ffa450edef68
akitanaka · Feb 2, 2015 · 95ab614 · 95ab614
1 parent e28ec1b
commit 95ab614
Show file tree

Hide file tree

Showing 42 changed files with 1,299 additions and 174 deletions.
diff --git a/common/pom.xml b/common/pom.xml
@@ -55,6 +55,11 @@
       <artifactId>commons-logging</artifactId>
       <version>${commons-logging.version}</version>
     </dependency>
+    <dependency>
+      <groupId>joda-time</groupId>
+      <artifactId>joda-time</artifactId>
+      <version>${joda.version}</version>
+    </dependency>
     <dependency>
       <groupId>log4j</groupId>
       <artifactId>log4j</artifactId>

diff --git a/common/src/java/org/apache/hive/common/util/HiveStringUtils.java b/common/src/java/org/apache/hive/common/util/HiveStringUtils.java
@@ -45,6 +45,7 @@
 import org.apache.hadoop.hive.common.classification.InterfaceAudience;
 import org.apache.hadoop.hive.common.classification.InterfaceStability;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
 
 /**
  * HiveStringUtils
@@ -491,6 +492,34 @@ public static String[] split(
     return strList.toArray(new String[strList.size()]);
   }
 
+  /**
+   * Splits a string on the default separator ({@code COMMA}), honouring the default
+   * escape character ({@code ESCAPE_CHAR}), and unescapes every resulting piece.
+   * Delegates to {@link #splitAndUnEscape(String, char, char)}.
+   * @param str the string to split; may contain escaped separators
+   * @return an array of unescaped strings
+   */
+  public static String[] splitAndUnEscape(String str) {
+    final String[] unescaped = splitAndUnEscape(str, ESCAPE_CHAR, COMMA);
+    return unescaped;
+  }
+
+  /**
+   * Splits a string on the given separator, treating separators preceded by the
+   * escape character as literal text, and then unescapes each piece in place with
+   * the same escape/separator pair.
+   * @param str a string that may contain escaped separators
+   * @param escapeChar the char used to escape the separator
+   * @param separator the separator char
+   * @return an array of unescaped strings, or null when the split itself yields null
+   */
+  public static String[] splitAndUnEscape(String str, char escapeChar, char separator) {
+    final String[] pieces = split(str, escapeChar, separator);
+    if (pieces == null) {
+      // Nothing to unescape; hand the null straight back to the caller.
+      return null;
+    }
+
+    int pos = 0;
+    while (pos < pieces.length) {
+      pieces[pos] = unEscapeString(pieces[pos], escapeChar, separator);
+      pos++;
+    }
+    return pieces;
+  }
+
   /**
    * Finds the first occurrence of the separator character ignoring the escaped
    * separators starting from the index. Note the substring between the index

diff --git a/common/src/java/org/apache/hive/common/util/TimestampParser.java b/common/src/java/org/apache/hive/common/util/TimestampParser.java
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.common.util;
+
+import java.math.BigDecimal;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.joda.time.DateTime;
+import org.joda.time.MutableDateTime;
+import org.joda.time.DateTimeFieldType;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+import org.joda.time.format.DateTimeFormatterBuilder;
+import org.joda.time.format.DateTimeParser;
+import org.joda.time.format.DateTimeParserBucket;
+
+/**
+ * Timestamp parser using Joda DateTimeFormatter. Parser accepts 0 or more date time format
+ * patterns. If no format patterns are provided it will default to the normal Timestamp parsing
+ * ({@link Timestamp#valueOf(String)}); the same fallback is used when none of the configured
+ * patterns consumes the entire input string.
+ * Datetime formats are compatible with Java SimpleDateFormat. Also added special case pattern
+ * "millis" (matched case-insensitively) to parse the string as milliseconds since Unix epoch.
+ * Since this uses Joda DateTimeFormatter, this parser should be thread safe.
+ */
+public class TimestampParser {
+
+  /** Zero-length array used as the type token for {@code List.toArray}. */
+  protected final static String[] stringArray = new String[] {};
+  /** Special-case pattern name selecting {@link MillisDateFormatParser}. */
+  protected final static String millisFormatString = "millis";
+  /** Base value for parsing, so fields absent from a pattern default to 1970-01-01 00:00:00.000. */
+  protected final static DateTime startingDateValue = new DateTime(1970, 1, 1, 0, 0, 0, 0);
+
+  /** Patterns supplied by the caller; null when only default parsing is used. */
+  protected String[] formatStrings = null;
+  /** Combined formatter for all patterns; null when no patterns were supplied. */
+  protected DateTimeFormatter fmt = null;
+
+  /**
+   * Creates a parser with no custom patterns; only default timestamp parsing is used.
+   */
+  public TimestampParser() {
+  }
+
+  /**
+   * Creates a parser configured with a copy of another parser's patterns.
+   * @param tsParser the parser whose patterns are copied
+   */
+  public TimestampParser(TimestampParser tsParser) {
+    this(tsParser.formatStrings == null ?
+        null : Arrays.copyOf(tsParser.formatStrings, tsParser.formatStrings.length));
+  }
+
+  /**
+   * Creates a parser from a list of patterns.
+   * @param formatStrings patterns to try; null or empty means default parsing only
+   */
+  public TimestampParser(List<String> formatStrings) {
+    this(formatStrings == null ? null : formatStrings.toArray(stringArray));
+  }
+
+  /**
+   * Creates a parser from an array of patterns.
+   * @param formatStrings patterns to try; null or empty means default parsing only
+   */
+  public TimestampParser(String[] formatStrings) {
+    this.formatStrings = formatStrings;
+
+    // create formatter that includes all of the input patterns
+    if (formatStrings != null && formatStrings.length > 0) {
+      DateTimeParser[] parsers = new DateTimeParser[formatStrings.length];
+      for (int idx = 0; idx < formatStrings.length; ++idx) {
+        String formatString = formatStrings[idx];
+        if (formatString.equalsIgnoreCase(millisFormatString)) {
+          // Use milliseconds parser if pattern matches our special-case millis pattern string
+          parsers[idx] = new MillisDateFormatParser();
+        } else {
+          parsers[idx] = DateTimeFormat.forPattern(formatString).getParser();
+        }
+      }
+      // Parse-only formatter: no printer is supplied, only the alternative parsers.
+      fmt = new DateTimeFormatterBuilder().append(null, parsers).toFormatter();
+    }
+  }
+
+  /**
+   * Parse the input string and return a timestamp value.
+   * The configured patterns are tried first; if none of them consumes the whole
+   * string, the value is handed to {@link Timestamp#valueOf(String)}.
+   * @param strValue the text to parse
+   * @return the parsed timestamp
+   * @throws IllegalArgumentException if input string cannot be parsed into timestamp,
+   *         including when it is null
+   */
+  public Timestamp parseTimestamp(String strValue) throws IllegalArgumentException {
+    // A null input skips the custom patterns so that Timestamp.valueOf() rejects it with
+    // the documented IllegalArgumentException instead of a NullPointerException.
+    if (fmt != null && strValue != null) {
+      // reset value in case any date fields are missing from the date pattern
+      MutableDateTime mdt = new MutableDateTime(startingDateValue);
+
+      // Using parseInto() avoids throwing exception when parsing,
+      // allowing fallback to default timestamp parsing if custom patterns fail.
+      int ret = fmt.parseInto(mdt, strValue, 0);
+      // Only accept parse results if we parsed the entire string
+      if (ret == strValue.length()) {
+        return new Timestamp(mdt.getMillis());
+      }
+    }
+
+    // Otherwise try default timestamp parsing
+    return Timestamp.valueOf(strValue);
+  }
+
+  /**
+   * DateTimeParser to parse the date string as the millis since Unix epoch.
+   * Accepts an optionally negative integer with an optional fractional part; the
+   * fractional part is ignored.
+   */
+  public static class MillisDateFormatParser implements DateTimeParser {
+    // Matcher is stateful, so each thread reuses its own instance.
+    private static final ThreadLocal<Matcher> numericMatcher = new ThreadLocal<Matcher>() {
+      @Override
+      protected Matcher initialValue() {
+        return Pattern.compile("(-?\\d+)(\\.\\d+)?$").matcher("");
+      }
+    };
+
+    // Fields copied from the decoded instant into the parser bucket.
+    private final static DateTimeFieldType[] dateTimeFields = {
+      DateTimeFieldType.year(),
+      DateTimeFieldType.monthOfYear(),
+      DateTimeFieldType.dayOfMonth(),
+      DateTimeFieldType.hourOfDay(),
+      DateTimeFieldType.minuteOfHour(),
+      DateTimeFieldType.secondOfMinute(),
+      DateTimeFieldType.millisOfSecond()
+    };
+
+    @Override
+    public int estimateParsedLength() {
+      return 13; // Shouldn't hit 14 digits until year 2286
+    }
+
+    /**
+     * Parses the remainder of {@code text}, starting at {@code position}, as epoch millis.
+     * @param bucket receives the date/time fields of the parsed instant
+     * @param text the full input text
+     * @param position index in {@code text} at which parsing starts
+     * @return the new position (end of text) on success, or {@code ~position} on failure
+     */
+    @Override
+    public int parseInto(DateTimeParserBucket bucket, String text, int position) {
+      String substr = text.substring(position);
+      Matcher matcher = numericMatcher.get();
+      matcher.reset(substr);
+      if (!matcher.matches()) {
+        // Negative result signals failure; its complement is the failing position.
+        return ~position;
+      }
+
+      // Joda DateTime only has precision to millis, cut off any fractional portion
+      long millis;
+      try {
+        millis = Long.parseLong(matcher.group(1));
+      } catch (NumberFormatException e) {
+        // Digit run does not fit in a long: report a failed parse so that the other
+        // patterns (and the default fallback) still get a chance.
+        return ~position;
+      }
+      // NOTE(review): fields are read through new DateTime(millis), i.e. the default
+      // time zone - confirm the round trip is exact around DST transitions.
+      DateTime dt = new DateTime(millis);
+      for (DateTimeFieldType field : dateTimeFields) {
+        bucket.saveField(field, dt.get(field));
+      }
+      // The pattern must match through the end of the input, so the new absolute
+      // position is the end of the text (not the length of the parsed substring).
+      return text.length();
+    }
+  }
+}
diff --git a/common/src/test/org/apache/hive/common/util/TestHiveStringUtils.java b/common/src/test/org/apache/hive/common/util/TestHiveStringUtils.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.common.util;
+
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+public class TestHiveStringUtils {
+  @Test
+  public void testSplitAndUnEscape() throws Exception {
+    splitAndUnEscapeTestCase(
+        null, null);
+
+    splitAndUnEscapeTestCase(
+        "'single element'",
+        new String[] {
+            "'single element'"
+        });
+
+    splitAndUnEscapeTestCase(
+        "yyyy-MM-dd'T'HH:mm:ss,yyyy-MM-dd'T'HH:mm:ss.S",
+        new String[] {
+            "yyyy-MM-dd'T'HH:mm:ss",
+            "yyyy-MM-dd'T'HH:mm:ss.S"
+        });
+
+    splitAndUnEscapeTestCase(
+        "single\\,element",
+        new String[] {
+            "single,element"
+        });
+    splitAndUnEscapeTestCase(
+        "element\\,one\\\\,element\\\\two\\\\\\,",
+        new String[] {
+            "element,one\\",
+            "element\\two\\,"
+        });
+  }
+
+  public void splitAndUnEscapeTestCase(String testValue, String[] expectedResults) throws Exception {
+    String[] testResults = HiveStringUtils.splitAndUnEscape(testValue);
+    assertTrue(Arrays.toString(expectedResults) + " == " + Arrays.toString(testResults),
+        Arrays.equals(expectedResults, testResults));
+  }
+}