You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ro...@apache.org on 2013/06/13 15:03:21 UTC

svn commit: r1492658 - in /pig/trunk: ./ src/docs/src/documentation/content/xdocs/ src/org/apache/pig/ src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/ src/org/apache/pig/builtin/ src/org/apache/pig/data/ test/org/apache/pig/test/

Author: rohini
Date: Thu Jun 13 13:03:20 2013
New Revision: 1492658

URL: http://svn.apache.org/r1492658
Log:
PIG-3341: Strict datetime parsing and improve performance of loading datetime values (rohini)

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
    pig/trunk/src/org/apache/pig/Main.java
    pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POCast.java
    pig/trunk/src/org/apache/pig/builtin/ToDate.java
    pig/trunk/src/org/apache/pig/builtin/ToDateISO.java
    pig/trunk/src/org/apache/pig/builtin/ToMilliSeconds.java
    pig/trunk/src/org/apache/pig/builtin/Utf8StorageConverter.java
    pig/trunk/src/org/apache/pig/data/DataType.java
    pig/trunk/test/org/apache/pig/test/TestDefaultDateTimeZone.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Thu Jun 13 13:03:20 2013
@@ -385,6 +385,8 @@ PIG-2769: a simple logic causes very lon
 
 BUG FIXES
 
+PIG-3341: Strict datetime parsing and improve performance of loading datetime values (rohini)
+
 PIG-3329: RANK operator failed when working with SPLIT (xalan via cheolsoo)
 
 PIG-3345: Handle null in DateTime functions (rohini)

Modified: pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/func.xml?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/func.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/func.xml Thu Jun 13 13:03:20 2013
@@ -4138,7 +4138,7 @@ Use the SecondsBetween function to get t
 </section>
 
 <!-- ======================================================== -->  
- <section id="subtract-duratioin">
+ <section id="subtract-duration">
    <title>SubtractDuration</title>
    <p>Returns the result of a DateTime object minus a <a href="http://en.wikipedia.org/wiki/ISO_8601#Durations">Duration object</a>.</p>
 
@@ -4257,6 +4257,89 @@ Use the ToDate function to generate a Da
 </section>
 </section>
 
+<!-- ======================================================== -->
+<section id="to-milli-seconds">
+    <title>ToMilliSeconds</title>
+    <p>
+        Returns the number of milliseconds elapsed since January 1, 1970, 00:00:00.000 GMT
+        for a DateTime object.
+    </p>
+
+    <section>
+        <title>Syntax</title>
+        <table>
+            <tr>
+                <td>
+                    <p>ToMilliSeconds(datetime)</p>
+                </td>
+            </tr>
+        </table>
+    </section>
+
+    <section>
+        <title>Terms</title>
+        <table>
+            <tr>
+                <td>
+                    <p>datetime</p>
+                </td>
+                <td>
+                    <p>A datetime object.</p>
+                </td>
+            </tr>
+        </table>
+    </section>
+
+    <section>
+        <title>Usage</title>
+        <p>
+            Use the ToMilliSeconds function to convert a DateTime to the number of
+            milliseconds that have passed since January 1, 1970, 00:00:00.000 GMT.
+        </p>
+    </section>
+</section>
+
+<!-- ======================================================== -->
+<section id="to-unix-time">
+    <title>ToUnixTime</title>
+    <p>
+        Returns the Unix time, as a long, for a DateTime object. Unix time is the
+        number of seconds elapsed since January 1, 1970, 00:00:00.000 GMT.
+    </p>
+
+    <section>
+        <title>Syntax</title>
+        <table>
+            <tr>
+                <td>
+                    <p>ToUnixTime(datetime)</p>
+                </td>
+            </tr>
+        </table>
+    </section>
+
+    <section>
+        <title>Terms</title>
+        <table>
+            <tr>
+                <td>
+                    <p>datetime</p>
+                </td>
+                <td>
+                    <p>A datetime object.</p>
+                </td>
+            </tr>
+        </table>
+    </section>
+
+    <section>
+        <title>Usage</title>
+        <p>
+            Use the ToUnixTime function to convert the DateTime to Unix Time.
+        </p>
+    </section>
+</section>
+
 <!-- ======================================================== -->  
  <section id="weeks-between">
    <title>WeeksBetween</title>

Modified: pig/trunk/src/org/apache/pig/Main.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/Main.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/Main.java (original)
+++ pig/trunk/src/org/apache/pig/Main.java Thu Jun 13 13:03:20 2013
@@ -914,6 +914,8 @@ public static void printProperties(){
         System.out.println("        pig.additional.jars=<colon seperated list of jars>. Used in place of register command.");
         System.out.println("        udf.import.list=<comma seperated list of imports>. Used to avoid package names in UDF.");
         System.out.println("        stop.on.failure=true|false; default is false. Set to true to terminate on the first error.");
+        System.out.println("        pig.datetime.default.tz=<UTC time offset>. e.g. +08:00. Default is the default timezone of the host.");
+        System.out.println("            Determines the timezone used to handle datetime datatype and UDFs. ");
 	System.out.println("Additionally, any Hadoop property can be specified.");
 }
 

Modified: pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POCast.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POCast.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POCast.java (original)
+++ pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POCast.java Thu Jun 13 13:03:20 2013
@@ -53,7 +53,6 @@ import org.apache.pig.impl.plan.VisitorE
 import org.apache.pig.impl.util.CastUtils;
 import org.apache.pig.impl.util.LogUtils;
 import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
 
 /**
  * This is just a cast that converts DataByteArray into either String or
@@ -1071,15 +1070,9 @@ public class POCast extends ExpressionOp
             return in.getNextDateTime();
 
         case DataType.CHARARRAY: {
-            String str = null;
             Result res = in.getNextString();
             if (res.returnStatus == POStatus.STATUS_OK && res.result != null) {
-                DateTimeZone dtz = ToDate.extractDateTimeZone((String) res.result);
-                if (dtz == null) {
-                    res.result = new DateTime((String) res.result);
-                } else {
-                    res.result = new DateTime((String) res.result, dtz);
-                }
+                res.result = ToDate.extractDateTime((String) res.result);
             }
             return res;
         }
@@ -1658,12 +1651,7 @@ public class POCast extends ExpressionOp
                 result = (DateTime)obj;
                 break;
             case DataType.CHARARRAY:
-                DateTimeZone dtz = ToDate.extractDateTimeZone((String) obj);
-                if (dtz == null) {
-                    result = new DateTime((String) obj);
-                } else {
-                    result = new DateTime((String) obj, dtz);
-                }
+                result = ToDate.extractDateTime((String) obj);
                 break;
             case DataType.BIGINTEGER:
                 result = new DateTime(((BigInteger)obj).longValue());

Modified: pig/trunk/src/org/apache/pig/builtin/ToDate.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/ToDate.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/ToDate.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/ToDate.java Thu Jun 13 13:03:20 2013
@@ -21,8 +21,6 @@ package org.apache.pig.builtin;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.pig.EvalFunc;
 import org.apache.pig.FuncSpec;
@@ -32,12 +30,14 @@ import org.apache.pig.impl.logicalLayer.
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.joda.time.DateTime;
 import org.joda.time.DateTimeZone;
+import org.joda.time.format.DateTimeFormatter;
+import org.joda.time.format.ISODateTimeFormat;
 
 /**
- * 
+ *
  * <p>ToDate converts the ISO or the customized string or the Unix timestamp to the DateTime object.</p>
  * <p>ToDate is overloaded.</p>
- * 
+ *
  * <dl>
  * <dt><b>Syntax:</b></dt>
  * <dd><code>DateTime ToDate(Long millis)</code>.</dd>
@@ -55,7 +55,7 @@ import org.joda.time.DateTimeZone;
  * <dt><b>Output:</b></dt>
  * <dd><code>the DateTime object</code>.</dd>
  * </dl>
- * 
+ *
  * <dl>
  * <dt><b>Syntax:</b></dt>
  * <dd><code>DateTime ToDate(String dtStr, String format)</code>.</dd>
@@ -78,8 +78,9 @@ import org.joda.time.DateTimeZone;
  * </dl>
  */
 public class ToDate extends EvalFunc<DateTime> {
-    
-    private static final Pattern TIMEZONE_PATTERN = Pattern.compile("(Z|(?<=(T[0-9\\.:]{0,12}))((\\+|-)\\d{2}(:?\\d{2})?))$");
+
+    private static final DateTimeFormatter isoDateTimeFormatter = ISODateTimeFormat
+            .dateOptionalTimeParser().withOffsetParsed();
 
     public DateTime exec(Tuple input) throws IOException {
         if (input == null || input.size() < 1 || input.get(0) == null) {
@@ -114,18 +115,12 @@ public class ToDate extends EvalFunc<Dat
         funcList.add(new FuncSpec(ToDate3ARGS.class.getName(), s));
         return funcList;
     }
-    
+
     public static DateTimeZone extractDateTimeZone(String dtStr) {
-        Matcher matcher = TIMEZONE_PATTERN.matcher(dtStr);
-        if (matcher.find()) {
-            String dtzStr = matcher.group();
-            if (dtzStr.equals("Z")) {
-                return DateTimeZone.forOffsetMillis(DateTimeZone.UTC.getOffset(null));
-            } else {
-                return DateTimeZone.forOffsetMillis(DateTimeZone.forID(dtzStr).getOffset(null));
-            }
-        } else {
-            return null;
-        }
+        return isoDateTimeFormatter.parseDateTime(dtStr).getZone();
+    }
+
+    public static DateTime extractDateTime(String dtStr) {
+        return isoDateTimeFormatter.parseDateTime(dtStr);
     }
 }

Modified: pig/trunk/src/org/apache/pig/builtin/ToDateISO.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/ToDateISO.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/ToDateISO.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/ToDateISO.java Thu Jun 13 13:03:20 2013
@@ -24,7 +24,6 @@ import org.apache.pig.EvalFunc;
 import org.apache.pig.data.DataType;
 import org.apache.pig.data.Tuple;
 import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
 
 /**
  * This method should never be used directly, use {@link ToDate}.
@@ -36,12 +35,7 @@ public class ToDateISO extends EvalFunc<
             return null;
         }
         String dtStr = DataType.toString(input.get(0));
-        DateTimeZone dtz = ToDate.extractDateTimeZone(dtStr);
-        if (dtz == null) {
-            return new DateTime(dtStr);
-        } else {
-            return new DateTime(dtStr, dtz);
-        }
+        return ToDate.extractDateTime(dtStr);
     }
 
 }

Modified: pig/trunk/src/org/apache/pig/builtin/ToMilliSeconds.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/ToMilliSeconds.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/ToMilliSeconds.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/ToMilliSeconds.java Thu Jun 13 13:03:20 2013
@@ -31,7 +31,7 @@ import org.joda.time.DateTime;
 
 /**
  * <p>
- * ToUnixTime converts the DateTime to the number of milliseconds that have passed
+ * ToMilliSeconds converts the DateTime to the number of milliseconds that have passed
  * since January 1, 1970 00:00:00.000 GMT.
  * </p>
  * <ul>

Modified: pig/trunk/src/org/apache/pig/builtin/Utf8StorageConverter.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/Utf8StorageConverter.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/Utf8StorageConverter.java (original)
+++ pig/trunk/src/org/apache/pig/builtin/Utf8StorageConverter.java Thu Jun 13 13:03:20 2013
@@ -43,7 +43,6 @@ import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.apache.pig.impl.util.LogUtils;
 import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
 
 /**
  * This abstract class provides standard conversions between utf8 encoded data
@@ -523,12 +522,7 @@ public class Utf8StorageConverter implem
         }
         try {
             String dtStr = new String(b);
-            DateTimeZone dtz = ToDate.extractDateTimeZone(dtStr);
-            if (dtz == null) {
-                return new DateTime(dtStr);
-            } else {
-                return new DateTime(dtStr, dtz);
-            }
+            return ToDate.extractDateTime(dtStr);
         } catch (IllegalArgumentException e) {
             LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                     "converted to datetime, caught IllegalArgumentException <" +

Modified: pig/trunk/src/org/apache/pig/data/DataType.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/DataType.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/data/DataType.java (original)
+++ pig/trunk/src/org/apache/pig/data/DataType.java Thu Jun 13 13:03:20 2013
@@ -39,8 +39,6 @@ import org.apache.pig.impl.logicalLayer.
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException;
 import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-
 
 
 /**
@@ -1075,12 +1073,7 @@ public class DataType {
                 return new DateTime(((DataByteArray) o).toString());
             case CHARARRAY:
                 // the string can contain just date part or date part plus time part
-                DateTimeZone dtz = ToDate.extractDateTimeZone((String) o);
-                if (dtz == null) {
-                    return new DateTime((String) o);
-                } else {
-                    return new DateTime((String) o, dtz);
-                }
+                return ToDate.extractDateTime((String) o);
             case INTEGER:
                 return new DateTime(((Integer) o).longValue());
             case LONG:

Modified: pig/trunk/test/org/apache/pig/test/TestDefaultDateTimeZone.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestDefaultDateTimeZone.java?rev=1492658&r1=1492657&r2=1492658&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestDefaultDateTimeZone.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestDefaultDateTimeZone.java Thu Jun 13 13:03:20 2013
@@ -79,9 +79,9 @@ public class TestDefaultDateTimeZone ext
             Tuple actualTuple = actualItr.next();
             assertEquals(expectedTuple, actualTuple);
         }
-        assertEquals(expectedItr.hasNext(), actualItr.hasNext()); 
+        assertEquals(expectedItr.hasNext(), actualItr.hasNext());
     }
-    
+
     private static Iterator<Tuple> generateExpectedResults(DateTimeZone dtz)
             throws Exception {
         List<Tuple> expectedResults = new ArrayList<Tuple>();
@@ -96,9 +96,12 @@ public class TestDefaultDateTimeZone ext
         return expectedResults.iterator();
     }
 
-    // PIG-3316
-    public void testTimeZoneExtracting() throws IOException {
-        String[] inputs = { 
+    public void testTimeZone() throws IOException {
+        // Usually set through "pig.datetime.default.tz"
+        String defaultDTZ = "+03:00";
+        DateTimeZone.setDefault(DateTimeZone.forOffsetMillis(DateTimeZone.forID(defaultDTZ)
+                .getOffset(null)));
+        String[] inputs = {
                 "1970-01-01T00:00:00.000-08:00",
                 "1970-01-01T00:00",
                 "1970-01-01T00",
@@ -109,34 +112,50 @@ public class TestDefaultDateTimeZone ext
                 "1970-01-01T00-05:00",
                 "1970-01-01T-08:00",
                 "1970-01T-08:00",
-                "1970T+8:00",
+                //"1970T+8:00", //Invalid format
                 "1970-01-01",
                 "1970-01",
                 "1970",
         };
-        String[] expectedOutputs = {
-        		"-08:00",
-        		"null",
-        		"null",
-        		"null",
-        		"null",
-        		"null",
-        		"-08:00",
-        		"-05:00",
-        		"-08:00",
-        		"-08:00",
-        		"null",
-        		"null",
-        		"null",
-        		"null"
+        String[] expectedDTZOutputs = {
+                "-08:00",
+                defaultDTZ,
+                defaultDTZ,
+                defaultDTZ,
+                defaultDTZ,
+                defaultDTZ,
+                "-08:00",
+                "-05:00",
+                "-08:00",
+                "-08:00",
+                defaultDTZ,
+                defaultDTZ,
+                defaultDTZ
+        };
+        String[] expectedDTOutputs = {
+                "1970-01-01T00:00:00.000-08:00",
+                "1970-01-01T00:00:00.000" + defaultDTZ,
+                "1970-01-01T00:00:00.000" + defaultDTZ,
+                "1970-01-01T00:00:00.000" + defaultDTZ,
+                "1970-01-01T00:00:00.000" + defaultDTZ,
+                "1970-01-01T00:00:00.000" + defaultDTZ,
+                "1970-01-01T00:00:00.000-08:00",
+                "1970-01-01T00:00:00.000-05:00",
+                "1970-01-01T00:00:00.000-08:00",
+                "1970-01-01T00:00:00.000-08:00",
+                "1970-01-01T00:00:00.000" + defaultDTZ,
+                "1970-01-01T00:00:00.000" + defaultDTZ,
+                "1970-01-01T00:00:00.000" + defaultDTZ
         };
-        
+
         for( int i = 0; i < inputs.length; i++ ) {
             DateTimeZone dtz = ToDate.extractDateTimeZone( inputs[i] );
-            assertEquals( expectedOutputs[i], dtz == null ? "null" : dtz.toString() );
-            System.out.println( "\"" + dtz + "\"," );
+            assertEquals( expectedDTZOutputs[i], dtz.toString() );
+            DateTime dt = ToDate.extractDateTime( inputs[i] );
+            assertEquals( expectedDTOutputs[i], dt.toString() );
+            System.out.println( "\"" + dt + "\"," );
         }
-        
+
     }
 
 }