You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by vo...@apache.org on 2019/04/01 06:36:02 UTC

[drill] 03/04: DRILL-7077: Add Function to Facilitate Time Series Analysis

This is an automated email from the ASF dual-hosted git repository.

volodymyr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git

commit 3c798d338f4f4dab6713956e0a94d18e6e5c72bd
Author: Charles S. Givre <cg...@gmail.com>
AuthorDate: Mon Mar 25 13:45:01 2019 -0400

    DRILL-7077: Add Function to Facilitate Time Series Analysis
    
    closes #1680
---
 .../drill/exec/udfs/NearestDateFunctions.java      | 147 ++++++++++++++++++++
 .../apache/drill/exec/udfs/NearestDateUtils.java   | 151 ++++++++++++++++++++
 .../drill/exec/udfs/TestNearestDateFunctions.java  | 153 +++++++++++++++++++++
 3 files changed, 451 insertions(+)

diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateFunctions.java
new file mode 100644
index 0000000..f61a216
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateFunctions.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.holders.TimeStampHolder;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+public class NearestDateFunctions {
+
+  /**
+   * This function takes two arguments, an input date object, and an interval and returns
+   * the previous date that is the first date in that period.  This function is intended to be used in time series analysis to
+   * aggregate by various units of time.
+   * Usage is:
+   * <p>
+   * SELECT <date_field>, COUNT(*) AS event_count
+   * FROM ...
+   * GROUP BY nearestDate(`date_field`, 'QUARTER')
+   * <p>
+   * Currently supports the following time units:
+   * <p>
+   * YEAR
+   * QUARTER
+   * MONTH
+   * WEEK_SUNDAY
+   * WEEK_MONDAY
+   * DAY
+   * HOUR
+   * HALF_HOUR
+   * QUARTER_HOUR
+   * MINUTE
+   * 30SECOND
+   * 15SECOND
+   * SECOND
+   */
+  @FunctionTemplate(name = "nearestDate",
+          scope = FunctionTemplate.FunctionScope.SIMPLE,
+          nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class NearestDateFunction implements DrillSimpleFunc {
+
+    @Param
+    TimeStampHolder inputDate;
+
+    @Param
+    VarCharHolder interval;
+
+    @Output
+    TimeStampHolder out;
+
+    @Override
+    public void setup() {
+    }
+
+    @Override
+    public void eval() {
+
+      String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(interval.start, interval.end, interval.buffer);
+      java.time.LocalDateTime ld = java.time.LocalDateTime.ofInstant(java.time.Instant.ofEpochMilli(inputDate.value), java.time.ZoneId.of("UTC"));
+
+      java.time.LocalDateTime td = org.apache.drill.exec.udfs.NearestDateUtils.getDate(ld, input);
+      out.value = td.atZone(java.time.ZoneId.of("UTC")).toInstant().toEpochMilli();
+    }
+  }
+
+  /**
+   * This function takes three arguments, an input date string, an input date format string, and an interval and returns
+   * the previous date that is the first date in that period.  This function is intended to be used in time series analysis to
+   * aggregate by various units of time.
+   * Usage is:
+   * <p>
+   * SELECT <date_field>, COUNT(*) AS event_count
+   * FROM ...
+   * GROUP BY nearestDate(`date_field`, 'yyyy-mm-dd', 'QUARTER')
+   * <p>
+   * Currently supports the following time units:
+   * <p>
+   * YEAR
+   * QUARTER
+   * MONTH
+   * WEEK_SUNDAY
+   * WEEK_MONDAY
+   * DAY
+   * HOUR
+   * HALF_HOUR
+   * QUARTER_HOUR
+   * MINUTE
+   * 30SECOND
+   * 15SECOND
+   * SECOND
+   */
+  @FunctionTemplate(name = "nearestDate",
+          scope = FunctionTemplate.FunctionScope.SIMPLE,
+          nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+  public static class NearestDateFunctionWithString implements DrillSimpleFunc {
+
+    @Param
+    VarCharHolder input;
+
+    @Param
+    VarCharHolder formatString;
+
+    @Param
+    VarCharHolder interval;
+
+    @Output
+    TimeStampHolder out;
+
+    @Override
+    public void setup() {
+    }
+
+    @Override
+    public void eval() {
+      String inputDate = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(input.start, input.end, input.buffer);
+
+      String format = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(formatString.start, formatString.end, formatString.buffer);
+
+      String intervalString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(interval.start, interval.end, interval.buffer);
+
+      java.time.format.DateTimeFormatter formatter = java.time.format.DateTimeFormatter.ofPattern(format);
+      java.time.LocalDateTime dateTime = java.time.LocalDateTime.parse(inputDate, formatter);
+
+      java.time.LocalDateTime td = org.apache.drill.exec.udfs.NearestDateUtils.getDate(dateTime, intervalString);
+      out.value = td.atZone(java.time.ZoneId.of("UTC")).toInstant().toEpochMilli();
+    }
+  }
+}
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateUtils.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateUtils.java
new file mode 100644
index 0000000..7d66f3b
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateUtils.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.common.exceptions.DrillRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.temporal.TemporalAdjusters;
+import java.time.LocalDateTime;
+import java.time.DayOfWeek;
+import java.time.temporal.ChronoUnit;
+import java.util.Arrays;
+
+public class NearestDateUtils {
+  /**
+   * Specifies the time grouping to be used with the nearest date function
+   */
+  protected enum TimeInterval {
+    YEAR,
+    QUARTER,
+    MONTH,
+    WEEK_SUNDAY,
+    WEEK_MONDAY,
+    DAY,
+    HOUR,
+    HALF_HOUR,
+    QUARTER_HOUR,
+    MINUTE,
+    HALF_MINUTE,
+    QUARTER_MINUTE,
+    SECOND
+  }
+
+  private static final Logger logger = LoggerFactory.getLogger(NearestDateUtils.class);
+
+  /**
+   * This function takes a Java LocalDateTime object, and an interval string and returns
+   * the nearest date closets to that time.  For instance, if you specified the date as 2018-05-04 and YEAR, the function
+   * will return 2018-01-01
+   *
+   * @param d        the original datetime before adjustments
+   * @param interval The interval string to deduct from the supplied date
+   * @return the modified LocalDateTime
+   */
+  public final static java.time.LocalDateTime getDate(java.time.LocalDateTime d, String interval) {
+    java.time.LocalDateTime newDate = d;
+    int year = d.getYear();
+    int month = d.getMonth().getValue();
+    int day = d.getDayOfMonth();
+    int hour = d.getHour();
+    int minute = d.getMinute();
+    int second = d.getSecond();
+    TimeInterval adjustmentAmount;
+    try {
+      adjustmentAmount = TimeInterval.valueOf(interval.toUpperCase());
+    } catch (IllegalArgumentException e) {
+      throw new DrillRuntimeException(String.format("[%s] is not a valid time statement. Expecting: %s", interval, Arrays.asList(TimeInterval.values())));
+    }
+    switch (adjustmentAmount) {
+      case YEAR:
+        newDate = LocalDateTime.of(year, 1, 1, 0, 0, 0);
+        break;
+      case QUARTER:
+        newDate = LocalDateTime.of(year, (month / 3) * 3 + 1, 1, 0, 0, 0);
+        break;
+      case MONTH:
+        newDate = LocalDateTime.of(year, month, 1, 0, 0, 0);
+        break;
+      case WEEK_SUNDAY:
+        newDate = newDate.with(TemporalAdjusters.previousOrSame(DayOfWeek.SUNDAY))
+                .truncatedTo(ChronoUnit.DAYS);
+        break;
+      case WEEK_MONDAY:
+        newDate = newDate.with(TemporalAdjusters.previousOrSame(DayOfWeek.MONDAY))
+                .truncatedTo(ChronoUnit.DAYS);
+        break;
+      case DAY:
+        newDate = LocalDateTime.of(year, month, day, 0, 0, 0);
+        break;
+      case HOUR:
+        newDate = LocalDateTime.of(year, month, day, hour, 0, 0);
+        break;
+      case HALF_HOUR:
+        if (minute >= 30) {
+          minute = 30;
+        } else {
+          minute = 0;
+        }
+        newDate = LocalDateTime.of(year, month, day, hour, minute, 0);
+        break;
+      case QUARTER_HOUR:
+        if (minute >= 45) {
+          minute = 45;
+        } else if (minute >= 30) {
+          minute = 30;
+        } else if (minute >= 15) {
+          minute = 15;
+        } else {
+          minute = 0;
+        }
+        newDate = LocalDateTime.of(year, month, day, hour, minute, 0);
+        break;
+      case MINUTE:
+        newDate = LocalDateTime.of(year, month, day, hour, minute, 0);
+        break;
+      case HALF_MINUTE:
+        if (second >= 30) {
+          second = 30;
+        } else {
+          second = 0;
+        }
+        newDate = LocalDateTime.of(year, month, day, hour, minute, second);
+        break;
+      case QUARTER_MINUTE:
+        if (second >= 45) {
+          second = 45;
+        } else if (second >= 30) {
+          second = 30;
+        } else if (second >= 15) {
+          second = 15;
+        } else {
+          second = 0;
+        }
+        newDate = LocalDateTime.of(year, month, day, hour, minute, second);
+        break;
+      case SECOND:
+        newDate = LocalDateTime.of(year, month, day, hour, minute, second);
+        break;
+      default:
+        throw new DrillRuntimeException(String.format("[%s] is not a valid time statement. Expecting: %s", interval, Arrays.asList(TimeInterval.values())));
+    }
+    return newDate;
+  }
+}
diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestNearestDateFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestNearestDateFunctions.java
new file mode 100644
index 0000000..490db83
--- /dev/null
+++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestNearestDateFunctions.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.common.exceptions.DrillRuntimeException;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import java.util.Arrays;
+
+import java.time.LocalDateTime;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestNearestDateFunctions extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(builder);
+  }
+
+  @Test
+  public void testNearestDate() throws Exception {
+    String query = "SELECT nearestDate( TO_TIMESTAMP('2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'YEAR') AS nearest_year, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'QUARTER') AS nearest_quarter, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'MONTH') AS nearest_month, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'DAY') AS nearest_day, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'WEEK_SUNDAY') AS nearest_week_sunday, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'WEEK_MONDAY') AS nearest_week_monday, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'HOUR') AS nearest_hour, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:42:00', 'yyyy-MM-dd HH:mm:ss'), 'HALF_HOUR') AS nearest_half_hour, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:48:00', 'yyyy-MM-dd HH:mm:ss'), 'QUARTER_HOUR') AS nearest_quarter_hour, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'MINUTE') AS nearest_minute, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss'), 'HALF_MINUTE') AS nearest_30second, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss'), 'QUARTER_MINUTE') AS nearest_15second, " +
+            "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:31', 'yyyy-MM-dd HH:mm:ss'), 'SECOND') AS nearest_second " +
+            "FROM (VALUES(1))";
+    testBuilder()
+            .sqlQuery(query)
+            .unOrdered()
+            .baselineColumns("nearest_year",
+                    "nearest_quarter",
+                    "nearest_month",
+                    "nearest_day",
+                    "nearest_week_sunday",
+                    "nearest_week_monday",
+                    "nearest_hour",
+                    "nearest_half_hour",
+                    "nearest_quarter_hour",
+                    "nearest_minute",
+                    "nearest_30second",
+                    "nearest_15second",
+                    "nearest_second")
+            .baselineValues(LocalDateTime.of(2019, 1, 1, 0, 0, 0),  //Year
+                    LocalDateTime.of(2019, 1, 1, 0, 0, 0),  //Quarter
+                    LocalDateTime.of(2019, 2, 1, 0, 0, 0), //Month
+                    LocalDateTime.of(2019, 2, 15, 0, 0, 0), //Day
+                    LocalDateTime.of(2019, 2, 10, 0, 0, 0), //Week Sunday
+                    LocalDateTime.of(2019, 2, 11, 0, 0, 0), //Week Monday
+                    LocalDateTime.of(2019, 2, 15, 7, 0, 0), //Hour
+                    LocalDateTime.of(2019, 2, 15, 7, 30, 0), //Half Hour
+                    LocalDateTime.of(2019, 2, 15, 7, 45, 0), //Quarter Hour
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 0), //Minute
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 0), //30Second
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 15), //15Second
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 31)) //Second
+            .go();
+  }
+
+  @Test
+  public void testNearestDateWithTimestamp() throws Exception {
+    String query = "SELECT nearestDate( '2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'YEAR') AS nearest_year, " +
+            "nearestDate( '2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'QUARTER') AS nearest_quarter, " +
+            "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'MONTH') AS nearest_month, " +
+            "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'DAY') AS nearest_day, " +
+            "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'WEEK_SUNDAY') AS nearest_week_sunday, " +
+            "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'WEEK_MONDAY') AS nearest_week_monday, " +
+            "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'HOUR') AS nearest_hour, " +
+            "nearestDate( '2019-02-15 07:42:00', 'yyyy-MM-dd HH:mm:ss', 'HALF_HOUR') AS nearest_half_hour, " +
+            "nearestDate( '2019-02-15 07:48:00', 'yyyy-MM-dd HH:mm:ss', 'QUARTER_HOUR') AS nearest_quarter_hour, " +
+            "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'MINUTE') AS nearest_minute, " +
+            "nearestDate( '2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss', 'HALF_MINUTE') AS nearest_30second, " +
+            "nearestDate( '2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss', 'QUARTER_MINUTE') AS nearest_15second, " +
+            "nearestDate( '2019-02-15 07:22:31', 'yyyy-MM-dd HH:mm:ss', 'SECOND') AS nearest_second " +
+            "FROM (VALUES(1))";
+    testBuilder()
+            .sqlQuery(query)
+            .unOrdered()
+            .baselineColumns("nearest_year",
+                    "nearest_quarter",
+                    "nearest_month",
+                    "nearest_day",
+                    "nearest_week_sunday",
+                    "nearest_week_monday",
+                    "nearest_hour",
+                    "nearest_half_hour",
+                    "nearest_quarter_hour",
+                    "nearest_minute",
+                    "nearest_30second",
+                    "nearest_15second",
+                    "nearest_second")
+            .baselineValues(LocalDateTime.of(2019, 1, 1, 0, 0, 0),  //Year
+                    LocalDateTime.of(2019, 1, 1, 0, 0, 0),  //Quarter
+                    LocalDateTime.of(2019, 2, 1, 0, 0, 0), //Month
+                    LocalDateTime.of(2019, 2, 15, 0, 0, 0), //Day
+                    LocalDateTime.of(2019, 2, 10, 0, 0, 0), //Week Sunday
+                    LocalDateTime.of(2019, 2, 11, 0, 0, 0), //Week Monday
+                    LocalDateTime.of(2019, 2, 15, 7, 0, 0), //Hour
+                    LocalDateTime.of(2019, 2, 15, 7, 30, 0), //Half Hour
+                    LocalDateTime.of(2019, 2, 15, 7, 45, 0), //Quarter Hour
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 0), //Minute
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 0), //30Second
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 15), //15Second
+                    LocalDateTime.of(2019, 2, 15, 7, 22, 31)) //Second
+            .go();
+  }
+
+  @Test
+  public void testReadException() throws Exception {
+    String query = "SELECT nearestDate( TO_TIMESTAMP('2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'BAD_DATE') AS nearest_year " +
+            "FROM (VALUES(1))";
+    try {
+      run(query);
+      fail();
+    } catch (DrillRuntimeException e) {
+      assertTrue(e.getMessage().contains("[BAD_DATE] is not a valid time statement. Expecting: " + Arrays.asList(NearestDateUtils.TimeInterval.values())));
+    }
+  }
+}