You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by gp...@apache.org on 2019/09/08 03:00:11 UTC
[drill] 01/04: DRILL-7343: Add User-Agent UDFs to Drill
This is an automated email from the ASF dual-hosted git repository.
gparai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
commit bd6d7b11ff3f687f7257b86b17b8fe9e2e134b43
Author: Charles Givre <cg...@apache.org>
AuthorDate: Thu Sep 5 10:29:01 2019 -0400
DRILL-7343: Add User-Agent UDFs to Drill
closes #1840
---
contrib/udfs/README.md | 56 +++++++
contrib/udfs/pom.xml | 5 +
.../apache/drill/exec/udfs/UserAgentFunctions.java | 172 +++++++++++++++++++++
.../drill/exec/udfs/TestUserAgentFunctions.java | 171 ++++++++++++++++++++
4 files changed, 404 insertions(+)
diff --git a/contrib/udfs/README.md b/contrib/udfs/README.md
new file mode 100644
index 0000000..c0950e7
--- /dev/null
+++ b/contrib/udfs/README.md
@@ -0,0 +1,56 @@
+# Drill User Defined Functions
+
+This `README` documents functions which users have submitted to Apache Drill.
+
+## User Agent Functions
+Drill UDF for parsing User Agent Strings.
+This function is based on Niels Basjes Java library for parsing user agent strings which is available here: <https://github.com/nielsbasjes/yauaa>.
+
+### Usage
+The function `parse_user_agent()` takes a user agent string as an argument and returns a map of the available fields. Note that not every field will be present in every user agent string.
+```
+SELECT parse_user_agent( columns[0] ) as ua
+FROM dfs.`/tmp/data/drill-httpd/ua.csv`;
+```
+The query above returns:
+```
+{
+ "DeviceClass":"Desktop",
+ "DeviceName":"Macintosh",
+ "DeviceBrand":"Apple",
+ "OperatingSystemClass":"Desktop",
+ "OperatingSystemName":"Mac OS X",
+ "OperatingSystemVersion":"10.10.1",
+ "OperatingSystemNameVersion":"Mac OS X 10.10.1",
+ "LayoutEngineClass":"Browser",
+ "LayoutEngineName":"Blink",
+ "LayoutEngineVersion":"39.0",
+ "LayoutEngineVersionMajor":"39",
+ "LayoutEngineNameVersion":"Blink 39.0",
+ "LayoutEngineNameVersionMajor":"Blink 39",
+ "AgentClass":"Browser",
+ "AgentName":"Chrome",
+ "AgentVersion":"39.0.2171.99",
+ "AgentVersionMajor":"39",
+ "AgentNameVersion":"Chrome 39.0.2171.99",
+ "AgentNameVersionMajor":"Chrome 39",
+ "DeviceCpu":"Intel"
+}
+```
+The function returns a Drill map, so you can access any of the fields using Drill's table.map.key notation. For example, the query below illustrates how to extract a field from this map and summarize it:
+
+```
+SELECT uadata.ua.AgentNameVersion AS Browser,
+COUNT( * ) AS BrowserCount
+FROM (
+ SELECT parse_user_agent( columns[0] ) AS ua
+ FROM dfs.drillworkshop.`user-agents.csv`
+) AS uadata
+GROUP BY uadata.ua.AgentNameVersion
+ORDER BY BrowserCount DESC
+```
+The function can also be called with an optional field as an argument. IE:
+```
+SELECT parse_user_agent( `user_agent`, 'AgentName` ) as AgentName ...
+```
+which will just return the requested field. If the user agent string is empty, all fields will have the value of `Hacker`.
diff --git a/contrib/udfs/pom.xml b/contrib/udfs/pom.xml
index 38f7dfa..0c0f775 100644
--- a/contrib/udfs/pom.xml
+++ b/contrib/udfs/pom.xml
@@ -63,6 +63,11 @@
<artifactId>proj4j</artifactId>
<version>0.1.0</version>
</dependency>
+ <dependency>
+ <groupId>nl.basjes.parse.useragent</groupId>
+ <artifactId>yauaa</artifactId>
+ <version>5.11</version>
+ </dependency>
<!-- Test dependencies -->
<dependency>
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java
new file mode 100644
index 0000000..f684a2d
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.NullableVarCharHolder;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+
+import javax.inject.Inject;
+
+public class UserAgentFunctions {
+
+ @FunctionTemplate(name = "parse_user_agent",
+ scope = FunctionTemplate.FunctionScope.SIMPLE
+ )
+ public static class UserAgentFunction implements DrillSimpleFunc {
+ @Param
+ VarCharHolder input;
+
+ @Output
+ BaseWriter.ComplexWriter outWriter;
+
+ @Inject
+ DrillBuf outBuffer;
+
+ @Workspace
+ nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa;
+
+ public void setup() {
+ uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build();
+ uaa.getAllPossibleFieldNamesSorted();
+ }
+
+ public void eval() {
+ org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
+
+ String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
+
+ nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
+
+ for (String fieldName : agent.getAvailableFieldNamesSorted()) {
+
+ org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder();
+ String field = agent.getValue(fieldName);
+
+ byte[] rowStringBytes = field.getBytes();
+ outBuffer.reallocIfNeeded(rowStringBytes.length);
+ outBuffer.setBytes(0, rowStringBytes);
+
+ rowHolder.start = 0;
+ rowHolder.end = rowStringBytes.length;
+ rowHolder.buffer = outBuffer;
+
+ queryMapWriter.varChar(fieldName).write(rowHolder);
+ }
+ }
+ }
+
+ @FunctionTemplate(name = "parse_user_agent",
+ scope = FunctionTemplate.FunctionScope.SIMPLE
+ )
+ public static class NullableUserAgentFunction implements DrillSimpleFunc {
+ @Param
+ NullableVarCharHolder input;
+
+ @Output
+ BaseWriter.ComplexWriter outWriter;
+
+ @Inject
+ DrillBuf outBuffer;
+
+ @Workspace
+ nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa;
+
+ public void setup() {
+ uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build();
+ uaa.getAllPossibleFieldNamesSorted();
+ }
+
+ public void eval() {
+ org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
+ if (input.isSet == 0) {
+ // Return empty map
+ queryMapWriter.start();
+ queryMapWriter.end();
+ return;
+ }
+ String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
+
+ nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
+
+ for (String fieldName : agent.getAvailableFieldNamesSorted()) {
+
+ org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder();
+ String field = agent.getValue(fieldName);
+
+ byte[] rowStringBytes = field.getBytes();
+ outBuffer.reallocIfNeeded(rowStringBytes.length);
+ outBuffer.setBytes(0, rowStringBytes);
+
+ rowHolder.start = 0;
+ rowHolder.end = rowStringBytes.length;
+ rowHolder.buffer = outBuffer;
+
+ queryMapWriter.varChar(fieldName).write(rowHolder);
+ }
+ }
+ }
+
+ @FunctionTemplate(name = "parse_user_agent",
+ scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+
+ public static class UserAgentFieldFunction implements DrillSimpleFunc {
+ @Param
+ VarCharHolder input;
+
+ @Param
+ VarCharHolder desiredField;
+
+ @Output
+ VarCharHolder out;
+
+ @Inject
+ DrillBuf outBuffer;
+
+ @Workspace
+ nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa;
+
+ public void setup() {
+ uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build();
+ uaa.getAllPossibleFieldNamesSorted();
+ }
+
+ public void eval() {
+ String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
+ String requestedField = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(desiredField);
+
+ nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
+ String field = agent.getValue(requestedField);
+
+ byte[] rowStringBytes = field.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+ outBuffer.reallocIfNeeded(rowStringBytes.length);
+ outBuffer.setBytes(0, rowStringBytes);
+
+ out.start = 0;
+ out.end = rowStringBytes.length;
+ out.buffer = outBuffer;
+ }
+ }
+}
diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java
new file mode 100644
index 0000000..efa6708
--- /dev/null
+++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.HashMap;
+import java.util.Map;
+
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestUserAgentFunctions extends ClusterTest {
+
+ @BeforeClass
+ public static void setup() throws Exception {
+ ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+ startCluster(builder);
+ }
+
+ @Test
+ public void testParseUserAgentString() throws Exception {
+ String query = "SELECT t1.ua.DeviceClass AS DeviceClass,\n" +
+ "t1.ua.DeviceName AS DeviceName,\n" +
+ "t1.ua.DeviceBrand AS DeviceBrand,\n" +
+ "t1.ua.DeviceCpuBits AS DeviceCpuBits,\n" +
+ "t1.ua.OperatingSystemClass AS OperatingSystemClass,\n" +
+ "t1.ua.OperatingSystemName AS OperatingSystemName,\n" +
+ "t1.ua.OperatingSystemVersion AS OperatingSystemVersion,\n" +
+ "t1.ua.OperatingSystemVersionMajor AS OperatingSystemVersionMajor,\n" +
+ "t1.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion,\n" +
+ "t1.ua.OperatingSystemNameVersionMajor AS OperatingSystemNameVersionMajor,\n" +
+ "t1.ua.LayoutEngineClass AS LayoutEngineClass,\n" +
+ "t1.ua.LayoutEngineName AS LayoutEngineName,\n" +
+ "t1.ua.LayoutEngineVersion AS LayoutEngineVersion,\n" +
+ "t1.ua.LayoutEngineVersionMajor AS LayoutEngineVersionMajor,\n" +
+ "t1.ua.LayoutEngineNameVersion AS LayoutEngineNameVersion,\n" +
+ "t1.ua.LayoutEngineBuild AS LayoutEngineBuild,\n" +
+ "t1.ua.AgentClass AS AgentClass,\n" +
+ "t1.ua.AgentName AS AgentName,\n" +
+ "t1.ua.AgentVersion AS AgentVersion,\n" +
+ "t1.ua.AgentVersionMajor AS AgentVersionMajor,\n" +
+ "t1.ua.AgentNameVersionMajor AS AgentNameVersionMajor,\n" +
+ "t1.ua.AgentLanguage AS AgentLanguage,\n" +
+ "t1.ua.AgentLanguageCode AS AgentLanguageCode,\n" +
+ "t1.ua.AgentSecurity AS AgentSecurity\n" +
+ "FROM (SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11') AS ua FROM (values(1))) AS t1";
+
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("DeviceClass", "DeviceName", "DeviceBrand", "DeviceCpuBits", "OperatingSystemClass", "OperatingSystemName", "OperatingSystemVersion", "OperatingSystemVersionMajor", "OperatingSystemNameVersion", "OperatingSystemNameVersionMajor", "LayoutEngineClass", "LayoutEngineName", "LayoutEngineVersion", "LayoutEngineVersionMajor", "LayoutEngineNameVersion", "LayoutEngineBuild", "AgentClass", "AgentName", "AgentVersion", "AgentVersionMajor", "AgentNameVersionMajor", "AgentLang [...]
+ .baselineValues("Desktop", "Desktop", "Unknown", "32", "Desktop", "Windows NT", "XP", "XP", "Windows XP", "Windows XP", "Browser", "Gecko", "1.8.1.11", "1", "Gecko 1.8.1.11", "20071127", "Browser", "Firefox", "2.0.0.11", "2", "Firefox 2", "English (United States)", "en-us", "Strong security")
+ .go();
+ }
+
+ @Test
+ public void testGetHostName() throws Exception {
+ String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'AgentSecurity') AS agent FROM "
+ + "(values(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues("Strong security")
+ .go();
+ }
+
+ @Test
+ public void testEmptyFieldName() throws Exception {
+ String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', '') AS agent FROM " + "(values" +
+ "(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues("Unknown")
+ .go();
+ }
+
+ @Test
+ public void testNullUserAgent() throws Exception {
+ String query = "SELECT parse_user_agent(CAST(null as VARCHAR)) AS agent FROM (values(1))";
+ Map emptyMap = new HashMap();
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues(emptyMap)
+ .go();
+ }
+
+
+ @Test
+ public void testEmptyUAStringAndFieldName() throws Exception {
+ String query = "SELECT parse_user_agent('', '') AS agent FROM (values(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues("Unknown")
+ .go();
+ }
+
+ @Test
+ public void testNullUAStringAndEmptyFieldName() throws Exception {
+ String query = "SELECT parse_user_agent(CAST(null as VARCHAR), '') AS agent FROM (values(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues((String) null)
+ .go();
+ }
+
+ @Test
+ public void testNullUAStringAndNullFieldName() throws Exception {
+ String query = "SELECT parse_user_agent(CAST(null as VARCHAR), CAST(null as VARCHAR)) AS agent FROM (values(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues((String) null)
+ .go();
+ }
+
+ @Test
+ public void testNullUAStringAndFieldName() throws Exception {
+ String query = "SELECT parse_user_agent(CAST(null as VARCHAR), 'AgentSecurity') AS agent FROM (values(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues((String) null)
+ .go();
+ }
+
+ @Test
+ public void testEmptyUAString() throws Exception {
+ String query = "SELECT t1.ua.AgentName AS AgentName FROM (SELECT parse_user_agent('') AS ua FROM (values(1))) as t1";
+
+ // If the UA string is empty, all returned fields default to "Hacker"
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("AgentName")
+ .baselineValues("Hacker")
+ .go();
+ }
+}