You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by gp...@apache.org on 2019/09/08 03:00:11 UTC

[drill] 01/04: DRILL-7343: Add User-Agent UDFs to Drill

This is an automated email from the ASF dual-hosted git repository.

gparai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git

commit bd6d7b11ff3f687f7257b86b17b8fe9e2e134b43
Author: Charles Givre <cg...@apache.org>
AuthorDate: Thu Sep 5 10:29:01 2019 -0400

    DRILL-7343: Add User-Agent UDFs to Drill
    
    closes #1840
---
 contrib/udfs/README.md                             |  56 +++++++
 contrib/udfs/pom.xml                               |   5 +
 .../apache/drill/exec/udfs/UserAgentFunctions.java | 172 +++++++++++++++++++++
 .../drill/exec/udfs/TestUserAgentFunctions.java    | 171 ++++++++++++++++++++
 4 files changed, 404 insertions(+)

diff --git a/contrib/udfs/README.md b/contrib/udfs/README.md
new file mode 100644
index 0000000..c0950e7
--- /dev/null
+++ b/contrib/udfs/README.md
@@ -0,0 +1,56 @@
+# Drill User Defined Functions
+
+This `README` documents functions which users have submitted to Apache Drill.  
+
+## User Agent Functions
+Drill UDF for parsing User Agent Strings.
+This function is based on Niels Basjes Java library for parsing user agent strings which is available here: <https://github.com/nielsbasjes/yauaa>.
+
+### Usage
+The function `parse_user_agent()` takes a user agent string as an argument and returns a map of the available fields. Note that not every field will be present in every user agent string. 
+```
+SELECT parse_user_agent( columns[0] ) as ua 
+FROM dfs.`/tmp/data/drill-httpd/ua.csv`;
+```
+The query above returns:
+```
+{
+  "DeviceClass":"Desktop",
+  "DeviceName":"Macintosh",
+  "DeviceBrand":"Apple",
+  "OperatingSystemClass":"Desktop",
+  "OperatingSystemName":"Mac OS X",
+  "OperatingSystemVersion":"10.10.1",
+  "OperatingSystemNameVersion":"Mac OS X 10.10.1",
+  "LayoutEngineClass":"Browser",
+  "LayoutEngineName":"Blink",
+  "LayoutEngineVersion":"39.0",
+  "LayoutEngineVersionMajor":"39",
+  "LayoutEngineNameVersion":"Blink 39.0",
+  "LayoutEngineNameVersionMajor":"Blink 39",
+  "AgentClass":"Browser",
+  "AgentName":"Chrome",
+  "AgentVersion":"39.0.2171.99",
+  "AgentVersionMajor":"39",
+  "AgentNameVersion":"Chrome 39.0.2171.99",
+  "AgentNameVersionMajor":"Chrome 39",
+  "DeviceCpu":"Intel"
+}
+```
+The function returns a Drill map, so you can access any of the fields using Drill's table.map.key notation. For example, the query below illustrates how to extract a field from this map and summarize it:
+
+```
+SELECT uadata.ua.AgentNameVersion AS Browser,
+COUNT( * ) AS BrowserCount
+FROM (
+   SELECT parse_user_agent( columns[0] ) AS ua
+   FROM dfs.drillworkshop.`user-agents.csv`
+) AS uadata
+GROUP BY uadata.ua.AgentNameVersion
+ORDER BY BrowserCount DESC
+```
+The function can also be called with an optional field as an argument. IE:
+```
+SELECT parse_user_agent( `user_agent`, 'AgentName` ) as AgentName ...
+```
+which will just return the requested field. If the user agent string is empty, all fields will have the value of `Hacker`.  
diff --git a/contrib/udfs/pom.xml b/contrib/udfs/pom.xml
index 38f7dfa..0c0f775 100644
--- a/contrib/udfs/pom.xml
+++ b/contrib/udfs/pom.xml
@@ -63,6 +63,11 @@
       <artifactId>proj4j</artifactId>
       <version>0.1.0</version>
     </dependency>
+    <dependency>
+      <groupId>nl.basjes.parse.useragent</groupId>
+      <artifactId>yauaa</artifactId>
+      <version>5.11</version>
+    </dependency>
 
     <!-- Test dependencies -->
     <dependency>
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java
new file mode 100644
index 0000000..f684a2d
--- /dev/null
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import io.netty.buffer.DrillBuf;
+import org.apache.drill.exec.expr.DrillSimpleFunc;
+import org.apache.drill.exec.expr.annotations.FunctionTemplate;
+import org.apache.drill.exec.expr.annotations.Output;
+import org.apache.drill.exec.expr.annotations.Param;
+import org.apache.drill.exec.expr.annotations.Workspace;
+import org.apache.drill.exec.expr.holders.NullableVarCharHolder;
+import org.apache.drill.exec.expr.holders.VarCharHolder;
+import org.apache.drill.exec.vector.complex.writer.BaseWriter;
+
+import javax.inject.Inject;
+
+public class UserAgentFunctions {
+
+  @FunctionTemplate(name = "parse_user_agent",
+    scope = FunctionTemplate.FunctionScope.SIMPLE
+  )
+  public static class UserAgentFunction implements DrillSimpleFunc {
+    @Param
+    VarCharHolder input;
+
+    @Output
+    BaseWriter.ComplexWriter outWriter;
+
+    @Inject
+    DrillBuf outBuffer;
+
+    @Workspace
+    nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa;
+
+    public void setup() {
+      uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build();
+      uaa.getAllPossibleFieldNamesSorted();
+    }
+
+    public void eval() {
+      org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
+
+      String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
+
+      nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
+
+      for (String fieldName : agent.getAvailableFieldNamesSorted()) {
+
+        org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder();
+        String field = agent.getValue(fieldName);
+
+        byte[] rowStringBytes = field.getBytes();
+        outBuffer.reallocIfNeeded(rowStringBytes.length);
+        outBuffer.setBytes(0, rowStringBytes);
+
+        rowHolder.start = 0;
+        rowHolder.end = rowStringBytes.length;
+        rowHolder.buffer = outBuffer;
+
+        queryMapWriter.varChar(fieldName).write(rowHolder);
+      }
+    }
+  }
+
+  @FunctionTemplate(name = "parse_user_agent",
+    scope = FunctionTemplate.FunctionScope.SIMPLE
+  )
+  public static class NullableUserAgentFunction implements DrillSimpleFunc {
+    @Param
+    NullableVarCharHolder input;
+
+    @Output
+    BaseWriter.ComplexWriter outWriter;
+
+    @Inject
+    DrillBuf outBuffer;
+
+    @Workspace
+    nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa;
+
+    public void setup() {
+      uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build();
+      uaa.getAllPossibleFieldNamesSorted();
+    }
+
+    public void eval() {
+      org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
+      if (input.isSet == 0) {
+        // Return empty map
+        queryMapWriter.start();
+        queryMapWriter.end();
+        return;
+      }
+      String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
+
+      nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
+
+      for (String fieldName : agent.getAvailableFieldNamesSorted()) {
+
+        org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder();
+        String field = agent.getValue(fieldName);
+
+        byte[] rowStringBytes = field.getBytes();
+        outBuffer.reallocIfNeeded(rowStringBytes.length);
+        outBuffer.setBytes(0, rowStringBytes);
+
+        rowHolder.start = 0;
+        rowHolder.end = rowStringBytes.length;
+        rowHolder.buffer = outBuffer;
+
+        queryMapWriter.varChar(fieldName).write(rowHolder);
+      }
+    }
+  }
+
+  @FunctionTemplate(name = "parse_user_agent",
+    scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
+
+  public static class UserAgentFieldFunction implements DrillSimpleFunc {
+    @Param
+    VarCharHolder input;
+
+    @Param
+    VarCharHolder desiredField;
+
+    @Output
+    VarCharHolder out;
+
+    @Inject
+    DrillBuf outBuffer;
+
+    @Workspace
+    nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa;
+
+    public void setup() {
+      uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build();
+      uaa.getAllPossibleFieldNamesSorted();
+    }
+
+    public void eval() {
+      String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
+      String requestedField = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(desiredField);
+
+      nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
+      String field = agent.getValue(requestedField);
+
+      byte[] rowStringBytes = field.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+      outBuffer.reallocIfNeeded(rowStringBytes.length);
+      outBuffer.setBytes(0, rowStringBytes);
+
+      out.start = 0;
+      out.end = rowStringBytes.length;
+      out.buffer = outBuffer;
+    }
+  }
+}
diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java
new file mode 100644
index 0000000..efa6708
--- /dev/null
+++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.udfs;
+
+import org.apache.drill.categories.SqlFunctionTest;
+import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.HashMap;
+import java.util.Map;
+
+@Category({UnlikelyTest.class, SqlFunctionTest.class})
+public class TestUserAgentFunctions extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+    startCluster(builder);
+  }
+
+  @Test
+  public void testParseUserAgentString() throws Exception {
+    String query = "SELECT t1.ua.DeviceClass AS DeviceClass,\n" +
+      "t1.ua.DeviceName AS DeviceName,\n" +
+      "t1.ua.DeviceBrand AS DeviceBrand,\n" +
+      "t1.ua.DeviceCpuBits AS DeviceCpuBits,\n" +
+      "t1.ua.OperatingSystemClass AS OperatingSystemClass,\n" +
+      "t1.ua.OperatingSystemName AS OperatingSystemName,\n" +
+      "t1.ua.OperatingSystemVersion AS OperatingSystemVersion,\n" +
+      "t1.ua.OperatingSystemVersionMajor AS OperatingSystemVersionMajor,\n" +
+      "t1.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion,\n" +
+      "t1.ua.OperatingSystemNameVersionMajor AS OperatingSystemNameVersionMajor,\n" +
+      "t1.ua.LayoutEngineClass AS LayoutEngineClass,\n" +
+      "t1.ua.LayoutEngineName AS LayoutEngineName,\n" +
+      "t1.ua.LayoutEngineVersion AS LayoutEngineVersion,\n" +
+      "t1.ua.LayoutEngineVersionMajor AS LayoutEngineVersionMajor,\n" +
+      "t1.ua.LayoutEngineNameVersion AS LayoutEngineNameVersion,\n" +
+      "t1.ua.LayoutEngineBuild AS LayoutEngineBuild,\n" +
+      "t1.ua.AgentClass AS AgentClass,\n" +
+      "t1.ua.AgentName AS AgentName,\n" +
+      "t1.ua.AgentVersion AS AgentVersion,\n" +
+      "t1.ua.AgentVersionMajor AS AgentVersionMajor,\n" +
+      "t1.ua.AgentNameVersionMajor AS AgentNameVersionMajor,\n" +
+      "t1.ua.AgentLanguage AS AgentLanguage,\n" +
+      "t1.ua.AgentLanguageCode AS AgentLanguageCode,\n" +
+      "t1.ua.AgentSecurity AS AgentSecurity\n" +
+      "FROM (SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11') AS ua FROM (values(1))) AS t1";
+
+    testBuilder()
+      .sqlQuery(query)
+      .unOrdered()
+      .baselineColumns("DeviceClass", "DeviceName", "DeviceBrand", "DeviceCpuBits", "OperatingSystemClass", "OperatingSystemName", "OperatingSystemVersion", "OperatingSystemVersionMajor", "OperatingSystemNameVersion", "OperatingSystemNameVersionMajor", "LayoutEngineClass", "LayoutEngineName", "LayoutEngineVersion", "LayoutEngineVersionMajor", "LayoutEngineNameVersion", "LayoutEngineBuild", "AgentClass", "AgentName", "AgentVersion", "AgentVersionMajor", "AgentNameVersionMajor", "AgentLang [...]
+      .baselineValues("Desktop", "Desktop", "Unknown", "32", "Desktop", "Windows NT", "XP", "XP", "Windows XP", "Windows XP", "Browser", "Gecko", "1.8.1.11", "1", "Gecko 1.8.1.11", "20071127", "Browser", "Firefox", "2.0.0.11", "2", "Firefox 2", "English (United States)", "en-us", "Strong security")
+      .go();
+  }
+
+  @Test
+  public void testGetHostName() throws Exception {
+    String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'AgentSecurity') AS agent FROM "
+      + "(values(1))";
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("agent")
+      .baselineValues("Strong security")
+      .go();
+  }
+
+  @Test
+  public void testEmptyFieldName() throws Exception {
+    String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', '') AS agent FROM " + "(values" +
+      "(1))";
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("agent")
+      .baselineValues("Unknown")
+      .go();
+  }
+
+  @Test
+  public void testNullUserAgent() throws Exception {
+    String query = "SELECT parse_user_agent(CAST(null as VARCHAR)) AS agent FROM (values(1))";
+    Map emptyMap = new HashMap();
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("agent")
+      .baselineValues(emptyMap)
+      .go();
+  }
+
+
+  @Test
+  public void testEmptyUAStringAndFieldName() throws Exception {
+    String query = "SELECT parse_user_agent('', '') AS agent FROM (values(1))";
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("agent")
+      .baselineValues("Unknown")
+      .go();
+  }
+
+  @Test
+  public void testNullUAStringAndEmptyFieldName() throws Exception {
+    String query = "SELECT parse_user_agent(CAST(null as VARCHAR), '') AS agent FROM (values(1))";
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("agent")
+      .baselineValues((String) null)
+      .go();
+  }
+
+  @Test
+  public void testNullUAStringAndNullFieldName() throws Exception {
+    String query = "SELECT parse_user_agent(CAST(null as VARCHAR), CAST(null as VARCHAR)) AS agent FROM (values(1))";
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("agent")
+      .baselineValues((String) null)
+      .go();
+  }
+
+  @Test
+  public void testNullUAStringAndFieldName() throws Exception {
+    String query = "SELECT parse_user_agent(CAST(null as VARCHAR), 'AgentSecurity') AS agent FROM (values(1))";
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("agent")
+      .baselineValues((String) null)
+      .go();
+  }
+
+  @Test
+  public void testEmptyUAString() throws Exception {
+    String query = "SELECT t1.ua.AgentName AS AgentName FROM (SELECT parse_user_agent('') AS ua FROM (values(1))) as t1";
+
+    // If the UA string is empty, all returned fields default to "Hacker"
+    testBuilder()
+      .sqlQuery(query)
+      .ordered()
+      .baselineColumns("AgentName")
+      .baselineValues("Hacker")
+      .go();
+  }
+}