You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by cg...@apache.org on 2022/05/19 00:57:15 UTC
[drill] branch master updated: DRILL-8225: Update LogParser and Yauaa to support User-Agent Client Hints (#2549)
This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new aed78f33c8 DRILL-8225: Update LogParser and Yauaa to support User-Agent Client Hints (#2549)
aed78f33c8 is described below
commit aed78f33c8d93c2850897ae1bd5c228d84c30de3
Author: Niels Basjes <ni...@basjes.nl>
AuthorDate: Thu May 19 02:57:08 2022 +0200
DRILL-8225: Update LogParser and Yauaa to support User-Agent Client Hints (#2549)
* DRILL-8225: Update LogParser and Yauaa to support User-Agent Client Hints
* DRILL-8225: Replace Caffeine caching with Java 8 compliant solution
* DRILL-8225: Fix dependency issues
* DRILL-8225: Use new API for JDK8 caching
---
contrib/format-excel/pom.xml | 1 -
contrib/format-httpd/pom.xml | 20 ++
contrib/udfs/README.md | 108 ++++++-
contrib/udfs/pom.xml | 15 +
.../drill/exec/udfs/UserAgentAnalyzerProvider.java | 38 +++
.../apache/drill/exec/udfs/UserAgentFunctions.java | 82 +-----
.../drill/exec/udfs/TestUserAgentFunctions.java | 324 ++++++++++++++++++---
pom.xml | 5 +-
8 files changed, 488 insertions(+), 105 deletions(-)
diff --git a/contrib/format-excel/pom.xml b/contrib/format-excel/pom.xml
index 619aac4323..ed5a02494c 100644
--- a/contrib/format-excel/pom.xml
+++ b/contrib/format-excel/pom.xml
@@ -32,7 +32,6 @@
<properties>
<poi.version>5.2.1</poi.version>
- <log4j.version>2.17.2</log4j.version>
</properties>
<dependencies>
<dependency>
diff --git a/contrib/format-httpd/pom.xml b/contrib/format-httpd/pom.xml
index f8c71e736e..4731b6b69c 100644
--- a/contrib/format-httpd/pom.xml
+++ b/contrib/format-httpd/pom.xml
@@ -60,9 +60,29 @@
<groupId>nl.basjes.parse.httpdlog</groupId>
<artifactId>httpdlog-parser</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>com.github.ben-manes.caffeine</groupId>
+ <artifactId>caffeine</artifactId>
+ </exclusion>
</exclusions>
</dependency>
+ <!-- The default logging implementation for Yauaa -->
+ <!-- Send all Log4j2 calls to SLF4J -->
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-api</artifactId>
+ <version>${log4j.version}</version>
+ </dependency>
+
+ <!-- The default logging implementation for Yauaa -->
+ <!-- Send all Log4j2 calls to SLF4J -->
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-to-slf4j</artifactId>
+ <version>${log4j.version}</version>
+ </dependency>
+
<!-- Test dependencies -->
<dependency>
<groupId>org.apache.drill.exec</groupId>
diff --git a/contrib/udfs/README.md b/contrib/udfs/README.md
index ae65e1d11b..6f6799d5d3 100644
--- a/contrib/udfs/README.md
+++ b/contrib/udfs/README.md
@@ -224,8 +224,24 @@ SELECT time_bucket(time_stamp, 30000) AS five_min, avg(cpu)
Drill UDF for parsing User Agent Strings.
This function is based on Niels Basjes Java library for parsing user agent strings which is available here: <https://github.com/nielsbasjes/yauaa>.
-### Usage
+### Basic usage
The function `parse_user_agent()` takes a user agent string as an argument and returns a map of the available fields. Note that not every field will be present in every user agent string.
+
+The basic function signatures look like this:
+
+ parse_user_agent ( <useragent> )
+ parse_user_agent ( <useragent> , <desired fieldname> )
+
+To support the analysis of Client Hints, the function now also supports:
+
+ parse_user_agent ( <useragent> , [<header name>,<value>]+ )
+
+or the variant in which every value is named, which requires a `User-Agent` header to be present:
+
+ parse_user_agent ( [<header name>,<value>]+ )
+
+### Analyzing the User-Agent
+
```
SELECT parse_user_agent( columns[0] ) as ua
FROM dfs.`/tmp/data/drill-httpd/ua.csv`;
@@ -273,6 +289,96 @@ SELECT parse_user_agent( `user_agent`, 'AgentName` ) as AgentName ...
```
which will just return the requested field. If the user agent string is empty, all fields will have the value of `Hacker`.
+### Analyzing the User-Agent Client Hints
+
+Assume an Apache Httpd webserver with the following LogFormat config:
+
+ LogFormat "%a %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" \"%{Sec-CH-UA}i\" \"%{Sec-CH-UA-Arch}i\" \"%{Sec-CH-UA-Bitness}i\" \"%{Sec-CH-UA-Full-Version}i\" \"%{Sec-CH-UA-Full-Version-List}i\" \"%{Sec-CH-UA-Mobile}i\" \"%{Sec-CH-UA-Model}i\" \"%{Sec-CH-UA-Platform}i\" \"%{Sec-CH-UA-Platform-Version}i\" \"%{Sec-CH-UA-WoW64}i\" %V" combinedhintsvhost
+
+Behind this Apache Httpd webserver is a website that returns the header
+
+ Accept-CH: Sec-CH-UA, Sec-CH-UA-Arch, Sec-CH-UA-Bitness, Sec-CH-UA-Full-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Mobile, Sec-CH-UA-Model, Sec-CH-UA-Platform, Sec-CH-UA-Platform-Version, Sec-CH-UA-WoW64
+
+With all of this in place, these are two of the lines found in the access log of this Apache Httpd webserver:
+
+ 45.138.228.54 - - [02/May/2022:12:25:10 +0200] "GET / HTTP/1.1" 200 16141 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36" "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"100\", \"Google Chrome\";v=\"100\"" "\"x86\"" "\"64\"" "\"100.0.4896.127\"" "\" Not A;Brand\";v=\"99.0.0.0\", \"Chromium\";v=\"100.0.4896.127\", \"Google Chrome\";v=\"100.0.4896.127\"" "?0" "\"\"" "\"Linux\"" "\"5.13.0\"" "?0" try.yauaa.basjes.nl
+ 45.138.228.54 - - [02/May/2022:12:25:34 +0200] "GET / HTTP/1.1" 200 15376 "-" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Mobile Safari/537.36" "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"" "\"\"" "-" "\"101.0.4951.41\"" "\" Not A;Brand\";v=\"99.0.0.0\", \"Chromium\";v=\"101.0.4951.41\", \"Google Chrome\";v=\"101.0.4951.41\"" "?1" "\"Nokia 7.2\"" "\"Android\"" "\"11.0.0\"" "?0" try.yauaa.basjes.nl
+
+For this example the name of this file is `access.hints`
+
+When querying this data using ONLY the User-Agent as the input:
+
+ SELECT uadata.ua.DeviceClass AS DeviceClass,
+ uadata.ua.AgentNameVersionMajor AS AgentNameVersionMajor,
+ uadata.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion
+ FROM (
+ SELECT
+ parse_user_agent(`request_user-agent`) AS ua
+ FROM table(
+ dfs.`/tmp/access.hints` (
+ type => 'httpd',
+ logFormat => '%a %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i" "%{Sec-CH-UA}i" "%{Sec-CH-UA-Arch}i" "%{Sec-CH-UA-Bitness}i" "%{Sec-CH-UA-Full-Version}i" "%{Sec-CH-UA-Full-Version-List}i" "%{Sec-CH-UA-Mobile}i" "%{Sec-CH-UA-Model}i" "%{Sec-CH-UA-Platform}i" "%{Sec-CH-UA-Platform-Version}i" "%{Sec-CH-UA-WoW64}i" %V',
+ flattenWildcards => true
+ )
+ )
+ ) AS uadata;
+
+it produces
+
+ +-------------+-----------------------+----------------------------+
+ | DeviceClass | AgentNameVersionMajor | OperatingSystemNameVersion |
+ +-------------+-----------------------+----------------------------+
+ | Desktop | Chrome 100 | Linux ?? |
+ | Phone | Chrome 101 | Android ?? |
+ +-------------+-----------------------+----------------------------+
+ 2 rows selected (0.183 seconds)
+
+The first example here does not have the exact version of the operating system as part of the User-Agent and this results in `Linux ??`.
+
+The second example shows `Android 10` in its `User-Agent`, but it was recognized as being a `reduced` variant of the `User-Agent`. This means that the version `10` is a fixed placeholder value and not the real version, so here you see `Android ??`. See https://www.chromium.org/updates/ua-reduction
+
+Now let's repeat the same and use the recorded `User-Agent Client Hint` header values:
+
+ SELECT uadata.ua.DeviceClass AS DeviceClass,
+ uadata.ua.AgentNameVersionMajor AS AgentNameVersionMajor,
+ uadata.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion
+ FROM (
+ SELECT
+ parse_user_agent(
+ 'User-Agent' , `request_user-agent`,
+ 'sec-ch-ua', `request_header_sec-ch-ua`,
+ 'sec-ch-ua-arch', `request_header_sec-ch-ua-arch`,
+ 'sec-ch-ua-bitness', `request_header_sec-ch-ua-bitness`,
+ 'sec-ch-ua-full-version', `request_header_sec-ch-ua-full-version`,
+ 'sec-ch-ua-full-version-list', `request_header_sec-ch-ua-full-version-list`,
+ 'sec-ch-ua-mobile', `request_header_sec-ch-ua-mobile`,
+ 'sec-ch-ua-model', `request_header_sec-ch-ua-model`,
+ 'sec-ch-ua-platform', `request_header_sec-ch-ua-platform`,
+ 'sec-ch-ua-platform-version', `request_header_sec-ch-ua-platform-version`,
+ 'sec-ch-ua-wow64', `request_header_sec-ch-ua-wow64`
+ ) AS ua
+ FROM table(
+ dfs.`/tmp/access.hints` (
+ type => 'httpd',
+ logFormat => '%a %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i" "%{Sec-CH-UA}i" "%{Sec-CH-UA-Arch}i" "%{Sec-CH-UA-Bitness}i" "%{Sec-CH-UA-Full-Version}i" "%{Sec-CH-UA-Full-Version-List}i" "%{Sec-CH-UA-Mobile}i" "%{Sec-CH-UA-Model}i" "%{Sec-CH-UA-Platform}i" "%{Sec-CH-UA-Platform-Version}i" "%{Sec-CH-UA-WoW64}i" %V',
+ flattenWildcards => true
+ )
+ )
+ ) AS uadata;
+
+
+which produces
+
+ +-------------+-----------------------+----------------------------+
+ | DeviceClass | AgentNameVersionMajor | OperatingSystemNameVersion |
+ +-------------+-----------------------+----------------------------+
+ | Desktop | Chrome 100 | Linux 5.13.0 |
+ | Phone | Chrome 101 | Android 11.0.0 |
+ +-------------+-----------------------+----------------------------+
+ 2 rows selected (0.275 seconds)
+
+The improvement after adding the Client Hints is evident.
+
## Map Schema Function
This function allows you to drill down into the schema of maps. The REST API and JDBC interfaces will only return `MAP`, `LIST` for the MAP, however, it is not possible to get
the schema of the inner map. The function `getMapSchema(<MAP>)` will return a `MAP` of the fields and datatypes.
diff --git a/contrib/udfs/pom.xml b/contrib/udfs/pom.xml
index 5d50472170..a33085bf49 100644
--- a/contrib/udfs/pom.xml
+++ b/contrib/udfs/pom.xml
@@ -68,6 +68,12 @@
<groupId>nl.basjes.parse.useragent</groupId>
<artifactId>yauaa</artifactId>
<version>${yauaa.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>com.github.ben-manes.caffeine</groupId>
+ <artifactId>caffeine</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<!-- Test dependencies -->
@@ -86,6 +92,15 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
+
+ <!-- The default logging implementation for Yauaa -->
+ <!-- Send all Log4j2 calls to SLF4J -->
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-to-slf4j</artifactId>
+ <version>${log4j.version}</version>
+ </dependency>
+
</dependencies>
<build>
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentAnalyzerProvider.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentAnalyzerProvider.java
index 5094527b16..e3cdaa4e1c 100644
--- a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentAnalyzerProvider.java
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentAnalyzerProvider.java
@@ -18,7 +18,16 @@
package org.apache.drill.exec.udfs;
+import nl.basjes.parse.useragent.AnalyzerUtilities.ParsedArguments;
import nl.basjes.parse.useragent.UserAgentAnalyzer;
+import org.apache.drill.exec.expr.holders.NullableVarCharHolder;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static nl.basjes.parse.useragent.AnalyzerUtilities.parseArguments;
+import static nl.basjes.parse.useragent.UserAgent.USERAGENT_HEADER;
+import static org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder;
public class UserAgentAnalyzerProvider {
@@ -26,11 +35,40 @@ public class UserAgentAnalyzerProvider {
return UserAgentAnalyzerHolder.INSTANCE;
}
+ public static List<String> getAllFields() {
+ return UserAgentAnalyzerHolder.INSTANCE.getAllPossibleFieldNamesSorted();
+ }
+
+ private static List<String> allHeaders = null;
+
+ public static synchronized List<String> getAllHeaders() {
+ if (allHeaders == null) {
+ allHeaders = new ArrayList<>();
+ allHeaders.add(USERAGENT_HEADER);
+ allHeaders.addAll(getInstance().supportedClientHintHeaders());
+ }
+ return allHeaders;
+ }
+
private static class UserAgentAnalyzerHolder {
private static final UserAgentAnalyzer INSTANCE = UserAgentAnalyzer.newBuilder()
.dropTests()
.hideMatcherLoadStats()
+ // Caffeine is a Java 11+ library, so use the Java 8 compatible cache instead.
+ .useJava8CompatibleCaching()
.immediateInitialization()
.build();
}
+
+ public static ParsedArguments parseArgumentArray(NullableVarCharHolder[] input) {
+ List<String> inputList = new ArrayList<>();
+ for (NullableVarCharHolder holder : input) {
+ if (holder == null || holder.buffer == null) {
+ inputList.add(null);
+ } else {
+ inputList.add(getStringFromVarCharHolder(holder));
+ }
+ }
+ return parseArguments(inputList, getAllFields(), getAllHeaders());
+ }
}
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java
index 40f97a1a1d..39b7bccee3 100644
--- a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java
@@ -24,23 +24,22 @@ import org.apache.drill.exec.expr.annotations.FunctionTemplate;
import org.apache.drill.exec.expr.annotations.Output;
import org.apache.drill.exec.expr.annotations.Param;
import org.apache.drill.exec.expr.annotations.Workspace;
-import org.apache.drill.exec.expr.holders.NullableVarCharHolder;
-import org.apache.drill.exec.expr.holders.VarCharHolder;
-import org.apache.drill.exec.vector.complex.writer.BaseWriter;
import javax.inject.Inject;
public class UserAgentFunctions {
- @FunctionTemplate(name = "parse_user_agent",
+ @FunctionTemplate(
+ name = "parse_user_agent",
+ isVarArg = true,
scope = FunctionTemplate.FunctionScope.SIMPLE
)
public static class UserAgentFunction implements DrillSimpleFunc {
@Param
- VarCharHolder input;
+ org.apache.drill.exec.expr.holders.NullableVarCharHolder[] input;
@Output
- BaseWriter.ComplexWriter outWriter;
+ org.apache.drill.exec.vector.complex.writer.BaseWriter.ComplexWriter outWriter;
@Inject
DrillBuf outBuffer;
@@ -53,72 +52,23 @@ public class UserAgentFunctions {
public void setup() {
uaa = org.apache.drill.exec.udfs.UserAgentAnalyzerProvider.getInstance();
- allFields = uaa.getAllPossibleFieldNamesSorted();
+ allFields = org.apache.drill.exec.udfs.UserAgentAnalyzerProvider.getAllFields();
}
public void eval() {
org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
- String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
-
- nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
-
- for (String fieldName: allFields) {
-
- org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder();
- String field = agent.getValue(fieldName);
-
- byte[] rowStringBytes = field.getBytes();
- outBuffer = outBuffer.reallocIfNeeded(rowStringBytes.length);
- outBuffer.setBytes(0, rowStringBytes);
-
- rowHolder.start = 0;
- rowHolder.end = rowStringBytes.length;
- rowHolder.buffer = outBuffer;
-
- queryMapWriter.varChar(fieldName).write(rowHolder);
- }
- }
- }
-
- @FunctionTemplate(name = "parse_user_agent",
- scope = FunctionTemplate.FunctionScope.SIMPLE
- )
- public static class NullableUserAgentFunction implements DrillSimpleFunc {
- @Param
- NullableVarCharHolder input;
-
- @Output
- BaseWriter.ComplexWriter outWriter;
-
- @Inject
- DrillBuf outBuffer;
+ nl.basjes.parse.useragent.AnalyzerUtilities.ParsedArguments parsedArguments =
+ org.apache.drill.exec.udfs.UserAgentAnalyzerProvider.parseArgumentArray(input);
- @Workspace
- nl.basjes.parse.useragent.UserAgentAnalyzer uaa;
+ nl.basjes.parse.useragent.UserAgent agent = uaa.parse(parsedArguments.getRequestHeaders());
- @Workspace
- java.util.List<String> allFields;
-
- public void setup() {
- uaa = org.apache.drill.exec.udfs.UserAgentAnalyzerProvider.getInstance();
- allFields = uaa.getAllPossibleFieldNamesSorted();
- }
-
- public void eval() {
- org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap();
- if (input.isSet == 0) {
- // Return empty map
- queryMapWriter.start();
- queryMapWriter.end();
- return;
+ java.util.List<String> wantedFields = parsedArguments.getWantedFields();
+ if (wantedFields.isEmpty()) {
+ wantedFields = allFields;
}
- String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input);
-
- nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
-
- for (String fieldName: allFields) {
+ for (String fieldName : wantedFields) {
org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder();
String field = agent.getValue(fieldName);
@@ -140,13 +90,13 @@ public class UserAgentFunctions {
public static class UserAgentFieldFunction implements DrillSimpleFunc {
@Param
- VarCharHolder input;
+ org.apache.drill.exec.expr.holders.VarCharHolder input;
@Param
- VarCharHolder desiredField;
+ org.apache.drill.exec.expr.holders.VarCharHolder desiredField;
@Output
- VarCharHolder out;
+ org.apache.drill.exec.expr.holders.VarCharHolder out;
@Inject
DrillBuf outBuffer;
diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java
index 49b700156b..0ac40ed774 100644
--- a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java
+++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java
@@ -18,8 +18,11 @@
package org.apache.drill.exec.udfs;
+import nl.basjes.parse.useragent.UserAgentAnalyzer;
import org.apache.drill.categories.SqlFunctionTest;
import org.apache.drill.categories.UnlikelyTest;
+import org.apache.drill.common.expression.ExpressionStringBuilder;
+import org.apache.drill.exec.util.Text;
import org.apache.drill.test.ClusterFixture;
import org.apache.drill.test.ClusterFixtureBuilder;
import org.apache.drill.test.ClusterTest;
@@ -28,7 +31,12 @@ import org.junit.Test;
import org.junit.experimental.categories.Category;
import java.util.Collections;
+import java.util.LinkedHashMap;
import java.util.Map;
+import java.util.TreeMap;
+
+import static org.apache.drill.test.TestBuilder.parsePath;
+import static org.junit.Assert.assertEquals;
@Category({UnlikelyTest.class, SqlFunctionTest.class})
public class TestUserAgentFunctions extends ClusterTest {
@@ -41,44 +49,77 @@ public class TestUserAgentFunctions extends ClusterTest {
@Test
public void testParseUserAgentString() throws Exception {
- String query = "SELECT t1.ua.DeviceClass AS DeviceClass,\n" +
- "t1.ua.DeviceName AS DeviceName,\n" +
- "t1.ua.DeviceBrand AS DeviceBrand,\n" +
- "t1.ua.DeviceCpuBits AS DeviceCpuBits,\n" +
- "t1.ua.OperatingSystemClass AS OperatingSystemClass,\n" +
- "t1.ua.OperatingSystemName AS OperatingSystemName,\n" +
- "t1.ua.OperatingSystemVersion AS OperatingSystemVersion,\n" +
- "t1.ua.OperatingSystemVersionMajor AS OperatingSystemVersionMajor,\n" +
- "t1.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion,\n" +
- "t1.ua.OperatingSystemNameVersionMajor AS OperatingSystemNameVersionMajor,\n" +
- "t1.ua.LayoutEngineClass AS LayoutEngineClass,\n" +
- "t1.ua.LayoutEngineName AS LayoutEngineName,\n" +
- "t1.ua.LayoutEngineVersion AS LayoutEngineVersion,\n" +
- "t1.ua.LayoutEngineVersionMajor AS LayoutEngineVersionMajor,\n" +
- "t1.ua.LayoutEngineNameVersion AS LayoutEngineNameVersion,\n" +
- "t1.ua.LayoutEngineBuild AS LayoutEngineBuild,\n" +
- "t1.ua.AgentClass AS AgentClass,\n" +
- "t1.ua.AgentName AS AgentName,\n" +
- "t1.ua.AgentVersion AS AgentVersion,\n" +
- "t1.ua.AgentVersionMajor AS AgentVersionMajor,\n" +
- "t1.ua.AgentNameVersionMajor AS AgentNameVersionMajor,\n" +
- "t1.ua.AgentLanguage AS AgentLanguage,\n" +
- "t1.ua.AgentLanguageCode AS AgentLanguageCode,\n" +
- "t1.ua.AgentSecurity AS AgentSecurity\n" +
- "FROM (SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11') AS ua FROM (values(1))) AS t1";
+ String query =
+ "SELECT t1.ua.DeviceClass AS DeviceClass," +
+ " t1.ua.DeviceName AS DeviceName," +
+ " t1.ua.DeviceBrand AS DeviceBrand," +
+ " t1.ua.DeviceCpuBits AS DeviceCpuBits," +
+ " t1.ua.OperatingSystemClass AS OperatingSystemClass," +
+ " t1.ua.OperatingSystemName AS OperatingSystemName," +
+ " t1.ua.OperatingSystemVersion AS OperatingSystemVersion," +
+ " t1.ua.OperatingSystemVersionMajor AS OperatingSystemVersionMajor," +
+ " t1.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion," +
+ " t1.ua.OperatingSystemNameVersionMajor AS OperatingSystemNameVersionMajor," +
+ " t1.ua.LayoutEngineClass AS LayoutEngineClass," +
+ " t1.ua.LayoutEngineName AS LayoutEngineName," +
+ " t1.ua.LayoutEngineVersion AS LayoutEngineVersion," +
+ " t1.ua.LayoutEngineVersionMajor AS LayoutEngineVersionMajor," +
+ " t1.ua.LayoutEngineNameVersion AS LayoutEngineNameVersion," +
+ " t1.ua.LayoutEngineBuild AS LayoutEngineBuild," +
+ " t1.ua.AgentClass AS AgentClass," +
+ " t1.ua.AgentName AS AgentName," +
+ " t1.ua.AgentVersion AS AgentVersion," +
+ " t1.ua.AgentVersionMajor AS AgentVersionMajor," +
+ " t1.ua.AgentNameVersionMajor AS AgentNameVersionMajor," +
+ " t1.ua.AgentLanguage AS AgentLanguage," +
+ " t1.ua.AgentLanguageCode AS AgentLanguageCode," +
+ " t1.ua.AgentSecurity AS AgentSecurity " +
+ "FROM (" +
+ " SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11') AS ua" +
+ " FROM (values(1))" +
+ ") AS t1";
testBuilder()
.sqlQuery(query)
.unOrdered()
- .baselineColumns("DeviceClass", "DeviceName", "DeviceBrand", "DeviceCpuBits", "OperatingSystemClass", "OperatingSystemName", "OperatingSystemVersion", "OperatingSystemVersionMajor", "OperatingSystemNameVersion", "OperatingSystemNameVersionMajor", "LayoutEngineClass", "LayoutEngineName", "LayoutEngineVersion", "LayoutEngineVersionMajor", "LayoutEngineNameVersion", "LayoutEngineBuild", "AgentClass", "AgentName", "AgentVersion", "AgentVersionMajor", "AgentNameVersionMajor", "AgentLang [...]
- .baselineValues("Desktop", "Desktop", "Unknown", "32", "Desktop", "Windows NT", "XP", "XP", "Windows XP", "Windows XP", "Browser", "Gecko", "1.8.1.11", "1", "Gecko 1.8.1.11", "20071127", "Browser", "Firefox", "2.0.0.11", "2", "Firefox 2", "English (United States)", "en-us", "Strong security")
+ .baselineRecords(
+ Collections.singletonList(// Singleton list because we expect 1 record
+ expectations(
+ "DeviceClass", "Desktop",
+ "DeviceName", "Desktop",
+ "DeviceBrand", "Unknown",
+ "DeviceCpuBits", "32",
+ "OperatingSystemClass", "Desktop",
+ "OperatingSystemName", "Windows NT",
+ "OperatingSystemVersion", "XP",
+ "OperatingSystemVersionMajor", "XP",
+ "OperatingSystemNameVersion", "Windows XP",
+ "OperatingSystemNameVersionMajor", "Windows XP",
+ "LayoutEngineClass", "Browser",
+ "LayoutEngineName", "Gecko",
+ "LayoutEngineVersion", "1.8.1.11",
+ "LayoutEngineVersionMajor", "1",
+ "LayoutEngineNameVersion", "Gecko 1.8.1.11",
+ "LayoutEngineBuild", "20071127",
+ "AgentClass", "Browser",
+ "AgentName", "Firefox",
+ "AgentVersion", "2.0.0.11",
+ "AgentVersionMajor", "2",
+ "AgentNameVersionMajor", "Firefox 2",
+ "AgentLanguage", "English (United States)",
+ "AgentLanguageCode", "en-us",
+ "AgentSecurity", "Strong security"
+ )
+ )
+ )
.go();
}
@Test
- public void testGetHostName() throws Exception {
- String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'AgentSecurity') AS agent FROM "
- + "(values(1))";
+ public void testValidFieldName() throws Exception {
+ String query =
+ "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'AgentSecurity') AS agent " +
+ "FROM (values(1))";
testBuilder()
.sqlQuery(query)
.ordered()
@@ -89,8 +130,22 @@ public class TestUserAgentFunctions extends ClusterTest {
@Test
public void testEmptyFieldName() throws Exception {
- String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', '') AS agent FROM " + "(values" +
- "(1))";
+ String query =
+ "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', '') AS agent " +
+ "FROM (values(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues("Unknown")
+ .go();
+ }
+
+ @Test
+ public void testBadFieldName() throws Exception {
+ String query =
+ "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'NoSuchField') AS agent " +
+ "FROM (values(1))";
testBuilder()
.sqlQuery(query)
.ordered()
@@ -101,17 +156,24 @@ public class TestUserAgentFunctions extends ClusterTest {
@Test
public void testNullUserAgent() throws Exception {
+ // If a null value is provided then the UserAgentAnalyzer will classify this as a Hacker because all requests normally have a User-Agent.
+ UserAgentAnalyzer analyzer = UserAgentAnalyzer.newBuilder().showMinimalVersion().withoutCache().dropTests().immediateInitialization().build();
+ Map<String, String> expected = analyzer.parse((String)null).toMap(analyzer.getAllPossibleFieldNamesSorted());
+
+ Map<String, Text> expectedRecord = new TreeMap<>();
+ for (Map.Entry<String, String> entry : expected.entrySet()) {
+ expectedRecord.put(entry.getKey(), new Text(entry.getValue()));
+ }
+
String query = "SELECT parse_user_agent(CAST(null as VARCHAR)) AS agent FROM (values(1))";
- Map<?, ?> emptyMap = Collections.emptyMap();
testBuilder()
.sqlQuery(query)
.ordered()
.baselineColumns("agent")
- .baselineValues(emptyMap)
+ .baselineValues(expectedRecord)
.go();
}
-
@Test
public void testEmptyUAStringAndFieldName() throws Exception {
String query = "SELECT parse_user_agent('', '') AS agent FROM (values(1))";
@@ -134,6 +196,17 @@ public class TestUserAgentFunctions extends ClusterTest {
.go();
}
+ @Test
+ public void testNullUAStringAndBadFieldName() throws Exception {
+ String query = "SELECT parse_user_agent(CAST(null as VARCHAR), 'NoSuchField') AS agent FROM (values(1))";
+ testBuilder()
+ .sqlQuery(query)
+ .ordered()
+ .baselineColumns("agent")
+ .baselineValues((String) null)
+ .go();
+ }
+
@Test
public void testNullUAStringAndNullFieldName() throws Exception {
String query = "SELECT parse_user_agent(CAST(null as VARCHAR), CAST(null as VARCHAR)) AS agent FROM (values(1))";
@@ -168,4 +241,185 @@ public class TestUserAgentFunctions extends ClusterTest {
.baselineValues("Hacker")
.go();
}
+
+ @Test
+ public void testClientHints() throws Exception {
+ String query =
+ "SELECT " +
+ " t1.ua.DeviceClass AS DeviceClass,\n" +
+ " t1.ua.DeviceName AS DeviceName,\n" +
+ " t1.ua.DeviceBrand AS DeviceBrand,\n" +
+ " t1.ua.DeviceCpu AS DeviceCpu,\n" +
+ " t1.ua.DeviceCpuBits AS DeviceCpuBits,\n" +
+ " t1.ua.OperatingSystemClass AS OperatingSystemClass,\n" +
+ " t1.ua.OperatingSystemName AS OperatingSystemName,\n" +
+ " t1.ua.OperatingSystemVersion AS OperatingSystemVersion,\n" +
+ " t1.ua.OperatingSystemVersionMajor AS OperatingSystemVersionMajor,\n" +
+ " t1.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion,\n" +
+ " t1.ua.OperatingSystemNameVersionMajor AS OperatingSystemNameVersionMajor,\n" +
+ " t1.ua.LayoutEngineClass AS LayoutEngineClass,\n" +
+ " t1.ua.LayoutEngineName AS LayoutEngineName,\n" +
+ " t1.ua.LayoutEngineVersion AS LayoutEngineVersion,\n" +
+ " t1.ua.LayoutEngineVersionMajor AS LayoutEngineVersionMajor,\n" +
+ " t1.ua.LayoutEngineNameVersion AS LayoutEngineNameVersion,\n" +
+ " t1.ua.LayoutEngineNameVersionMajor AS LayoutEngineNameVersionMajor,\n" +
+ " t1.ua.AgentClass AS AgentClass,\n" +
+ " t1.ua.AgentName AS AgentName,\n" +
+ " t1.ua.AgentVersion AS AgentVersion,\n" +
+ " t1.ua.AgentVersionMajor AS AgentVersionMajor,\n" +
+ " t1.ua.AgentNameVersion AS AgentNameVersion,\n" +
+ " t1.ua.AgentNameVersionMajor AS AgentNameVersionMajor\n" +
+ "FROM (" +
+ " SELECT" +
+ " parse_user_agent(" +
+ " 'User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'," +
+ " 'Sec-Ch-Ua', '\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"100\", \"Google Chrome\";v=\"100\"'," +
+ " 'Sec-Ch-Ua-Arch', '\"x86\"'," +
+ " 'Sec-Ch-Ua-Bitness', '\"64\"'," +
+ " 'Sec-Ch-Ua-Full-Version', '\"100.0.4896.127\"'," +
+ " 'Sec-Ch-Ua-Full-Version-List', '\" Not A;Brand\";v=\"99.0.0.0\", \"Chromium\";v=\"100.0.4896.127\", \"Google Chrome\";v=\"100.0.4896.127\"'," +
+ " 'Sec-Ch-Ua-Mobile', '?0'," +
+ " 'Sec-Ch-Ua-Model', '\"\"'," +
+ " 'Sec-Ch-Ua-Platform', '\"Linux\"'," +
+ " 'Sec-Ch-Ua-Platform-Version', '\"5.13.0\"'," +
+ " 'Sec-Ch-Ua-Wow64', '?0'" +
+ " ) AS ua " +
+ " FROM (values(1))" +
+ ") AS t1";
+
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineRecords(
+ Collections.singletonList(// Singleton list because we expect 1 record
+ expectations(
+ "DeviceClass", "Desktop",
+ "DeviceName", "Linux Desktop",
+ "DeviceBrand", "Unknown",
+ "DeviceCpu", "Intel x86_64",
+ "DeviceCpuBits", "64",
+ "OperatingSystemClass", "Desktop",
+ "OperatingSystemName", "Linux",
+ "OperatingSystemVersion", "5.13.0",
+ "OperatingSystemVersionMajor", "5",
+ "OperatingSystemNameVersion", "Linux 5.13.0",
+ "OperatingSystemNameVersionMajor", "Linux 5",
+ "LayoutEngineClass", "Browser",
+ "LayoutEngineName", "Blink",
+ "LayoutEngineVersion", "100.0",
+ "LayoutEngineVersionMajor", "100",
+ "LayoutEngineNameVersion", "Blink 100.0",
+ "LayoutEngineNameVersionMajor", "Blink 100",
+ "AgentClass", "Browser",
+ "AgentName", "Chrome",
+ "AgentVersion", "100.0.4896.127",
+ "AgentVersionMajor", "100",
+ "AgentNameVersion", "Chrome 100.0.4896.127",
+ "AgentNameVersionMajor", "Chrome 100"
+ )
+ )
+ )
+ .go();
+ }
+
+ // ====================================================================
+
+ @Test
+ public void testEmptyClientHints() throws Exception {
+ String query =
+ "SELECT " +
+ " t1.ua.DeviceClass AS DeviceClass,\n" +
+ " t1.ua.DeviceName AS DeviceName,\n" +
+ " t1.ua.DeviceBrand AS DeviceBrand,\n" +
+ " t1.ua.DeviceCpu AS DeviceCpu,\n" +
+ " t1.ua.DeviceCpuBits AS DeviceCpuBits,\n" +
+ " t1.ua.OperatingSystemClass AS OperatingSystemClass,\n" +
+ " t1.ua.OperatingSystemName AS OperatingSystemName,\n" +
+ " t1.ua.OperatingSystemVersion AS OperatingSystemVersion,\n" +
+ " t1.ua.OperatingSystemVersionMajor AS OperatingSystemVersionMajor,\n" +
+ " t1.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion,\n" +
+ " t1.ua.OperatingSystemNameVersionMajor AS OperatingSystemNameVersionMajor,\n" +
+ " t1.ua.LayoutEngineClass AS LayoutEngineClass,\n" +
+ " t1.ua.LayoutEngineName AS LayoutEngineName,\n" +
+ " t1.ua.LayoutEngineVersion AS LayoutEngineVersion,\n" +
+ " t1.ua.LayoutEngineVersionMajor AS LayoutEngineVersionMajor,\n" +
+ " t1.ua.LayoutEngineNameVersion AS LayoutEngineNameVersion,\n" +
+ " t1.ua.LayoutEngineNameVersionMajor AS LayoutEngineNameVersionMajor,\n" +
+ " t1.ua.AgentClass AS AgentClass,\n" +
+ " t1.ua.AgentName AS AgentName,\n" +
+ " t1.ua.AgentVersion AS AgentVersion,\n" +
+ " t1.ua.AgentVersionMajor AS AgentVersionMajor,\n" +
+ " t1.ua.AgentNameVersion AS AgentNameVersion,\n" +
+ " t1.ua.AgentNameVersionMajor AS AgentNameVersionMajor\n" +
+ "FROM (" +
+ " SELECT" +
+ " parse_user_agent(" +
+ // NOTE: Here we do NOT say "User-Agent" --> It is just the first one in the list.
+ " 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'," +
+ " 'Sec-Ch-Ua', ''," +
+ " 'Sec-Ch-Ua-Arch', ''," +
+ " 'Sec-Ch-Ua-Bitness', ''," +
+ " 'Sec-Ch-Ua-Full-Version', ''," +
+ " 'Sec-Ch-Ua-Full-Version-List', ''," +
+ " 'Sec-Ch-Ua-Mobile', ''," +
+ " 'Sec-Ch-Ua-Model', ''," +
+ " 'Sec-Ch-Ua-Platform', ''," +
+ " 'Sec-Ch-Ua-Platform-Version', ''," +
+ " 'Sec-Ch-Ua-Wow64', ''" +
+ " ) AS ua " +
+ " FROM (values(1))" +
+ ") AS t1";
+
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineRecords(
+ Collections.singletonList(// Singleton list because we expect 1 record
+ expectations(
+ "DeviceClass", "Desktop",
+ "DeviceName", "Linux Desktop",
+ "DeviceBrand", "Unknown",
+ "DeviceCpu", "Intel x86_64",
+ "DeviceCpuBits", "64",
+ "OperatingSystemClass", "Desktop",
+ "OperatingSystemName", "Linux",
+ "OperatingSystemVersion", "??",
+ "OperatingSystemVersionMajor", "??",
+ "OperatingSystemNameVersion", "Linux ??",
+ "OperatingSystemNameVersionMajor", "Linux ??",
+ "LayoutEngineClass", "Browser",
+ "LayoutEngineName", "Blink",
+ "LayoutEngineVersion", "100.0",
+ "LayoutEngineVersionMajor", "100",
+ "LayoutEngineNameVersion", "Blink 100.0",
+ "LayoutEngineNameVersionMajor", "Blink 100",
+ "AgentClass", "Browser",
+ "AgentName", "Chrome",
+ "AgentVersion", "100.0.4896.127",
+ "AgentVersionMajor", "100",
+ "AgentNameVersion", "Chrome 100.0.4896.127",
+ "AgentNameVersionMajor", "Chrome 100"
+ )
+ )
+ )
+ .go();
+ }
+
+ /**
+ * Converts a more readable list of keys and values into what the ClusterTest supports.
+ * @param strings List of ["key", "value"]
+ * @return A Map of the same keys and values that is in the right format.
+ */
+ private Map<String, Object> expectations(String... strings) {
+ Map<String, Object> expectations = new LinkedHashMap<>();
+ int index = 0;
+ assertEquals("The number of arguments for 'expectations' must be even", 0, strings.length % 2);
+
+ while (index < strings.length) {
+ expectations.put(ExpressionStringBuilder.toString(parsePath(strings[index])), strings[index+1]);
+ index+=2;
+ }
+ return expectations;
+ }
+
}
diff --git a/pom.xml b/pom.xml
index bff9f8b6c7..10b692b8fe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -134,8 +134,9 @@
<xerces.version>2.12.2</xerces.version>
<commons.configuration.version>1.10</commons.configuration.version>
<commons.beanutils.version>1.9.4</commons.beanutils.version>
- <httpdlog-parser.version>5.7</httpdlog-parser.version>
- <yauaa.version>5.20</yauaa.version>
+ <httpdlog-parser.version>5.8</httpdlog-parser.version>
+ <yauaa.version>7.1.0</yauaa.version>
+ <log4j.version>2.17.2</log4j.version>
<aircompressor.version>0.20</aircompressor.version>
<iceberg.version>0.12.1</iceberg.version>
<univocity-parsers.version>2.8.3</univocity-parsers.version>