You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by vo...@apache.org on 2019/04/01 06:36:02 UTC
[drill] 03/04: DRILL-7077: Add Function to Facilitate Time Series
Analysis
This is an automated email from the ASF dual-hosted git repository.
volodymyr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
commit 3c798d338f4f4dab6713956e0a94d18e6e5c72bd
Author: Charles S. Givre <cg...@gmail.com>
AuthorDate: Mon Mar 25 13:45:01 2019 -0400
DRILL-7077: Add Function to Facilitate Time Series Analysis
closes #1680
---
.../drill/exec/udfs/NearestDateFunctions.java | 147 ++++++++++++++++++++
.../apache/drill/exec/udfs/NearestDateUtils.java | 151 ++++++++++++++++++++
.../drill/exec/udfs/TestNearestDateFunctions.java | 153 +++++++++++++++++++++
3 files changed, 451 insertions(+)
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateFunctions.java
new file mode 100644
index 0000000..f61a216
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateFunctions.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.holders.TimeStampHolder;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+
+public class NearestDateFunctions {
+
+ /**
+ * This function takes two arguments, an input date object, and an interval and returns
+ * the previous date that is the first date in that period. This function is intended to be used in time series analysis to
+ * aggregate by various units of time.
+ * Usage is:
+ * <p>
+ * SELECT <date_field>, COUNT(*) AS event_count
+ * FROM ...
+ * GROUP BY nearestDate(`date_field`, 'QUARTER')
+ * <p>
+ * Currently supports the following time units:
+ * <p>
+ * YEAR
+ * QUARTER
+ * MONTH
+ * WEEK_SUNDAY
+ * WEEK_MONDAY
+ * DAY
+ * HOUR
+ * HALF_HOUR
+ * QUARTER_HOUR
+ * MINUTE
+ * 30SECOND
+ * 15SECOND
+ * SECOND
+ */
+ @FunctionTemplate(name = "nearestDate",
+ scope = FunctionTemplate.FunctionScope.SIMPLE,
+ nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class NearestDateFunction implements DrillSimpleFunc {
+
+ @Param
+ TimeStampHolder inputDate;
+
+ @Param
+ VarCharHolder interval;
+
+ @Output
+ TimeStampHolder out;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+
+ String input = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(interval.start, interval.end, interval.buffer);
+ java.time.LocalDateTime ld = java.time.LocalDateTime.ofInstant(java.time.Instant.ofEpochMilli(inputDate.value), java.time.ZoneId.of("UTC"));
+
+ java.time.LocalDateTime td = org.apache.drill.exec.udfs.NearestDateUtils.getDate(ld, input);
+ out.value = td.atZone(java.time.ZoneId.of("UTC")).toInstant().toEpochMilli();
+ }
+ }
+
+ /**
+ * This function takes three arguments, an input date string, an input date format string, and an interval and returns
+ * the previous date that is the first date in that period. This function is intended to be used in time series analysis to
+ * aggregate by various units of time.
+ * Usage is:
+ * <p>
+ * SELECT <date_field>, COUNT(*) AS event_count
+ * FROM ...
+ * GROUP BY nearestDate(`date_field`, 'yyyy-mm-dd', 'QUARTER')
+ * <p>
+ * Currently supports the following time units:
+ * <p>
+ * YEAR
+ * QUARTER
+ * MONTH
+ * WEEK_SUNDAY
+ * WEEK_MONDAY
+ * DAY
+ * HOUR
+ * HALF_HOUR
+ * QUARTER_HOUR
+ * MINUTE
+ * 30SECOND
+ * 15SECOND
+ * SECOND
+ */
+ @FunctionTemplate(name = "nearestDate",
+ scope = FunctionTemplate.FunctionScope.SIMPLE,
+ nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+ public static class NearestDateFunctionWithString implements DrillSimpleFunc {
+
+ @Param
+ VarCharHolder input;
+
+ @Param
+ VarCharHolder formatString;
+
+ @Param
+ VarCharHolder interval;
+
+ @Output
+ TimeStampHolder out;
+
+ @Override
+ public void setup() {
+ }
+
+ @Override
+ public void eval() {
+ String inputDate = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(input.start, input.end, input.buffer);
+
+ String format = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(formatString.start, formatString.end, formatString.buffer);
+
+ String intervalString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(interval.start, interval.end, interval.buffer);
+
+ java.time.format.DateTimeFormatter formatter = java.time.format.DateTimeFormatter.ofPattern(format);
+ java.time.LocalDateTime dateTime = java.time.LocalDateTime.parse(inputDate, formatter);
+
+ java.time.LocalDateTime td = org.apache.drill.exec.udfs.NearestDateUtils.getDate(dateTime, intervalString);
+ out.value = td.atZone(java.time.ZoneId.of("UTC")).toInstant().toEpochMilli();
+ }
+ }
+}
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateUtils.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateUtils.java
new file mode 100644
index 0000000..7d66f3b
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/NearestDateUtils.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.common.exceptions.DrillRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.temporal.TemporalAdjusters;
+import java.time.LocalDateTime;
+import java.time.DayOfWeek;
+import java.time.temporal.ChronoUnit;
+import java.util.Arrays;
+
+public class NearestDateUtils {
+ /**
+ * Specifies the time grouping to be used with the nearest date function
+ */
+ protected enum TimeInterval {
+ YEAR,
+ QUARTER,
+ MONTH,
+ WEEK_SUNDAY,
+ WEEK_MONDAY,
+ DAY,
+ HOUR,
+ HALF_HOUR,
+ QUARTER_HOUR,
+ MINUTE,
+ HALF_MINUTE,
+ QUARTER_MINUTE,
+ SECOND
+ }
+
+ private static final Logger logger = LoggerFactory.getLogger(NearestDateUtils.class);
+
+ /**
+ * This function takes a Java LocalDateTime object, and an interval string and returns
+ * the nearest date closets to that time. For instance, if you specified the date as 2018-05-04 and YEAR, the function
+ * will return 2018-01-01
+ *
+ * @param d the original datetime before adjustments
+ * @param interval The interval string to deduct from the supplied date
+ * @return the modified LocalDateTime
+ */
+ public final static java.time.LocalDateTime getDate(java.time.LocalDateTime d, String interval) {
+ java.time.LocalDateTime newDate = d;
+ int year = d.getYear();
+ int month = d.getMonth().getValue();
+ int day = d.getDayOfMonth();
+ int hour = d.getHour();
+ int minute = d.getMinute();
+ int second = d.getSecond();
+ TimeInterval adjustmentAmount;
+ try {
+ adjustmentAmount = TimeInterval.valueOf(interval.toUpperCase());
+ } catch (IllegalArgumentException e) {
+ throw new DrillRuntimeException(String.format("[%s] is not a valid time statement. Expecting: %s", interval, Arrays.asList(TimeInterval.values())));
+ }
+ switch (adjustmentAmount) {
+ case YEAR:
+ newDate = LocalDateTime.of(year, 1, 1, 0, 0, 0);
+ break;
+ case QUARTER:
+ newDate = LocalDateTime.of(year, (month / 3) * 3 + 1, 1, 0, 0, 0);
+ break;
+ case MONTH:
+ newDate = LocalDateTime.of(year, month, 1, 0, 0, 0);
+ break;
+ case WEEK_SUNDAY:
+ newDate = newDate.with(TemporalAdjusters.previousOrSame(DayOfWeek.SUNDAY))
+ .truncatedTo(ChronoUnit.DAYS);
+ break;
+ case WEEK_MONDAY:
+ newDate = newDate.with(TemporalAdjusters.previousOrSame(DayOfWeek.MONDAY))
+ .truncatedTo(ChronoUnit.DAYS);
+ break;
+ case DAY:
+ newDate = LocalDateTime.of(year, month, day, 0, 0, 0);
+ break;
+ case HOUR:
+ newDate = LocalDateTime.of(year, month, day, hour, 0, 0);
+ break;
+ case HALF_HOUR:
+ if (minute >= 30) {
+ minute = 30;
+ } else {
+ minute = 0;
+ }
+ newDate = LocalDateTime.of(year, month, day, hour, minute, 0);
+ break;
+ case QUARTER_HOUR:
+ if (minute >= 45) {
+ minute = 45;
+ } else if (minute >= 30) {
+ minute = 30;
+ } else if (minute >= 15) {
+ minute = 15;
+ } else {
+ minute = 0;
+ }
+ newDate = LocalDateTime.of(year, month, day, hour, minute, 0);
+ break;
+ case MINUTE:
+ newDate = LocalDateTime.of(year, month, day, hour, minute, 0);
+ break;
+ case HALF_MINUTE:
+ if (second >= 30) {
+ second = 30;
+ } else {
+ second = 0;
+ }
+ newDate = LocalDateTime.of(year, month, day, hour, minute, second);
+ break;
+ case QUARTER_MINUTE:
+ if (second >= 45) {
+ second = 45;
+ } else if (second >= 30) {
+ second = 30;
+ } else if (second >= 15) {
+ second = 15;
+ } else {
+ second = 0;
+ }
+ newDate = LocalDateTime.of(year, month, day, hour, minute, second);
+ break;
+ case SECOND:
+ newDate = LocalDateTime.of(year, month, day, hour, minute, second);
+ break;
+ default:
+ throw new DrillRuntimeException(String.format("[%s] is not a valid time statement. Expecting: %s", interval, Arrays.asList(TimeInterval.values())));
+ }
+ return newDate;
+ }
+}
diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestNearestDateFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestNearestDateFunctions.java
new file mode 100644
index 0000000..490db83
--- /dev/null
+++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestNearestDateFunctions.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.common.exceptions.DrillRuntimeException;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import java.util.Arrays;
+
+import java.time.LocalDateTime;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestNearestDateFunctions extends ClusterTest {
+
+ @BeforeClass
+ public static void setup() throws Exception {
+ ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+ startCluster(builder);
+ }
+
+ @Test
+ public void testNearestDate() throws Exception {
+ String query = "SELECT nearestDate( TO_TIMESTAMP('2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'YEAR') AS nearest_year, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'QUARTER') AS nearest_quarter, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'MONTH') AS nearest_month, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'DAY') AS nearest_day, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'WEEK_SUNDAY') AS nearest_week_sunday, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'WEEK_MONDAY') AS nearest_week_monday, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'HOUR') AS nearest_hour, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:42:00', 'yyyy-MM-dd HH:mm:ss'), 'HALF_HOUR') AS nearest_half_hour, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:48:00', 'yyyy-MM-dd HH:mm:ss'), 'QUARTER_HOUR') AS nearest_quarter_hour, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'MINUTE') AS nearest_minute, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss'), 'HALF_MINUTE') AS nearest_30second, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss'), 'QUARTER_MINUTE') AS nearest_15second, " +
+ "nearestDate( TO_TIMESTAMP('2019-02-15 07:22:31', 'yyyy-MM-dd HH:mm:ss'), 'SECOND') AS nearest_second " +
+ "FROM (VALUES(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("nearest_year",
+ "nearest_quarter",
+ "nearest_month",
+ "nearest_day",
+ "nearest_week_sunday",
+ "nearest_week_monday",
+ "nearest_hour",
+ "nearest_half_hour",
+ "nearest_quarter_hour",
+ "nearest_minute",
+ "nearest_30second",
+ "nearest_15second",
+ "nearest_second")
+ .baselineValues(LocalDateTime.of(2019, 1, 1, 0, 0, 0), //Year
+ LocalDateTime.of(2019, 1, 1, 0, 0, 0), //Quarter
+ LocalDateTime.of(2019, 2, 1, 0, 0, 0), //Month
+ LocalDateTime.of(2019, 2, 15, 0, 0, 0), //Day
+ LocalDateTime.of(2019, 2, 10, 0, 0, 0), //Week Sunday
+ LocalDateTime.of(2019, 2, 11, 0, 0, 0), //Week Monday
+ LocalDateTime.of(2019, 2, 15, 7, 0, 0), //Hour
+ LocalDateTime.of(2019, 2, 15, 7, 30, 0), //Half Hour
+ LocalDateTime.of(2019, 2, 15, 7, 45, 0), //Quarter Hour
+ LocalDateTime.of(2019, 2, 15, 7, 22, 0), //Minute
+ LocalDateTime.of(2019, 2, 15, 7, 22, 0), //30Second
+ LocalDateTime.of(2019, 2, 15, 7, 22, 15), //15Second
+ LocalDateTime.of(2019, 2, 15, 7, 22, 31)) //Second
+ .go();
+ }
+
+ @Test
+ public void testNearestDateWithTimestamp() throws Exception {
+ String query = "SELECT nearestDate( '2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'YEAR') AS nearest_year, " +
+ "nearestDate( '2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'QUARTER') AS nearest_quarter, " +
+ "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'MONTH') AS nearest_month, " +
+ "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'DAY') AS nearest_day, " +
+ "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'WEEK_SUNDAY') AS nearest_week_sunday, " +
+ "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'WEEK_MONDAY') AS nearest_week_monday, " +
+ "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'HOUR') AS nearest_hour, " +
+ "nearestDate( '2019-02-15 07:42:00', 'yyyy-MM-dd HH:mm:ss', 'HALF_HOUR') AS nearest_half_hour, " +
+ "nearestDate( '2019-02-15 07:48:00', 'yyyy-MM-dd HH:mm:ss', 'QUARTER_HOUR') AS nearest_quarter_hour, " +
+ "nearestDate( '2019-02-15 07:22:00', 'yyyy-MM-dd HH:mm:ss', 'MINUTE') AS nearest_minute, " +
+ "nearestDate( '2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss', 'HALF_MINUTE') AS nearest_30second, " +
+ "nearestDate( '2019-02-15 07:22:22', 'yyyy-MM-dd HH:mm:ss', 'QUARTER_MINUTE') AS nearest_15second, " +
+ "nearestDate( '2019-02-15 07:22:31', 'yyyy-MM-dd HH:mm:ss', 'SECOND') AS nearest_second " +
+ "FROM (VALUES(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("nearest_year",
+ "nearest_quarter",
+ "nearest_month",
+ "nearest_day",
+ "nearest_week_sunday",
+ "nearest_week_monday",
+ "nearest_hour",
+ "nearest_half_hour",
+ "nearest_quarter_hour",
+ "nearest_minute",
+ "nearest_30second",
+ "nearest_15second",
+ "nearest_second")
+ .baselineValues(LocalDateTime.of(2019, 1, 1, 0, 0, 0), //Year
+ LocalDateTime.of(2019, 1, 1, 0, 0, 0), //Quarter
+ LocalDateTime.of(2019, 2, 1, 0, 0, 0), //Month
+ LocalDateTime.of(2019, 2, 15, 0, 0, 0), //Day
+ LocalDateTime.of(2019, 2, 10, 0, 0, 0), //Week Sunday
+ LocalDateTime.of(2019, 2, 11, 0, 0, 0), //Week Monday
+ LocalDateTime.of(2019, 2, 15, 7, 0, 0), //Hour
+ LocalDateTime.of(2019, 2, 15, 7, 30, 0), //Half Hour
+ LocalDateTime.of(2019, 2, 15, 7, 45, 0), //Quarter Hour
+ LocalDateTime.of(2019, 2, 15, 7, 22, 0), //Minute
+ LocalDateTime.of(2019, 2, 15, 7, 22, 0), //30Second
+ LocalDateTime.of(2019, 2, 15, 7, 22, 15), //15Second
+ LocalDateTime.of(2019, 2, 15, 7, 22, 31)) //Second
+ .go();
+ }
+
+ @Test
+ public void testReadException() throws Exception {
+ String query = "SELECT nearestDate( TO_TIMESTAMP('2019-02-01 07:22:00', 'yyyy-MM-dd HH:mm:ss'), 'BAD_DATE') AS nearest_year " +
+ "FROM (VALUES(1))";
+ try {
+ run(query);
+ fail();
+ } catch (DrillRuntimeException e) {
+ assertTrue(e.getMessage().contains("[BAD_DATE] is not a valid time statement. Expecting: " + Arrays.asList(NearestDateUtils.TimeInterval.values())));
+ }
+ }
+}