You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@druid.apache.org by jo...@apache.org on 2019/12/12 20:54:52 UTC
[incubator-druid] branch 0.17.0-incubating updated: DataSketches jars in core (#9003) (#9012)

This is an automated email from the ASF dual-hosted git repository.

jonwei pushed a commit to branch 0.17.0-incubating
in repository https://gitbox.apache.org/repos/asf/incubator-druid.git


The following commit(s) were added to refs/heads/0.17.0-incubating by this push:
     new 37df7fc  DataSketches jars in core (#9003) (#9012)
37df7fc is described below

commit 37df7fc8df13fa77425844140cd142c7831dacd0
Author: Chi Cao Minh <ch...@imply.io>
AuthorDate: Thu Dec 12 12:54:39 2019 -0800

    DataSketches jars in core (#9003) (#9012)
    
    Having DataSketches jars in core will allow potential improvements, for
    example:
    - Provide an alternative implementation of HLL:
      https://datasketches.github.io/docs/HLL/HllSketchVsDruidHyperLogLogCollector.html
    - Range partitioning for native parallel batch indexing without having
      the user load extensions on the classpath
    
    Dev mailing list discussion:
    https://lists.apache.org/thread.html/301410d71ff799cf616bf17c4ebcf9999fc30829f5fa62909f403e6c%40%3Cdev.druid.apache.org%3E
---
 core/pom.xml                                       | 16 ++++++++++++
 docs/ingestion/native-batch.md                     |  4 ---
 extensions-core/datasketches/pom.xml               |  2 ++
 .../parallel/ParallelIndexSupervisorTask.java      | 29 ++++------------------
 licenses.yaml                                      |  4 +--
 5 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/core/pom.xml b/core/pom.xml
index 23361ff..5376a1a 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -42,6 +42,22 @@
 
   <dependencies>
     <dependency>
+      <groupId>org.apache.datasketches</groupId>
+      <artifactId>datasketches-java</artifactId>
+      <scope>runtime</scope>
+      <exclusions>
+        <exclusion>
+          <groupId>com.google.code.findbugs</groupId>
+          <artifactId>annotations</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.datasketches</groupId>
+      <artifactId>datasketches-memory</artifactId>
+      <scope>runtime</scope>
+    </dependency>
+    <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
     </dependency>
diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md
index f1af935..8934326 100644
--- a/docs/ingestion/native-batch.md
+++ b/docs/ingestion/native-batch.md
@@ -262,10 +262,6 @@ The three `partitionsSpec` types have different pros and cons:
 
 #### Single-dimension range partitioning
 
-> Single-dimension range partitioning currently requires the
-> [druid-datasketches](../development/extensions-core/datasketches-extension.md)
-> extension to be [loaded from the classpath](../development/extensions.md#loading-extensions-from-the-classpath).
-
 > Because single-range partitioning makes two passes over the input, the index task may fail if the input changes
 > in between the two passes. 
 
diff --git a/extensions-core/datasketches/pom.xml b/extensions-core/datasketches/pom.xml
index 4cb7131..2a93114 100644
--- a/extensions-core/datasketches/pom.xml
+++ b/extensions-core/datasketches/pom.xml
@@ -38,6 +38,7 @@
     <dependency>
       <groupId>org.apache.datasketches</groupId>
       <artifactId>datasketches-java</artifactId>
+      <scope>provided</scope>
       <exclusions>
         <exclusion>
           <groupId>com.google.code.findbugs</groupId>
@@ -48,6 +49,7 @@
     <dependency>
       <groupId>org.apache.datasketches</groupId>
       <artifactId>datasketches-memory</artifactId>
+      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.calcite</groupId>
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
index db31af6..ed30714 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
@@ -61,7 +61,6 @@ import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTaskRun
 import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries;
 import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution;
 import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger;
-import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch;
 import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger;
 import org.apache.druid.java.util.common.IAE;
 import org.apache.druid.java.util.common.ISE;
@@ -376,32 +375,9 @@ public class ParallelIndexSupervisorTask extends AbstractBatchIndexTask implemen
   @Override
   public boolean isReady(TaskActionClient taskActionClient) throws Exception
   {
-    if (useRangePartitions()) {
-      assertDataSketchesAvailable();
-    }
     return determineLockGranularityAndTryLock(taskActionClient, ingestionSchema.getDataSchema().getGranularitySpec());
   }
 
-  private boolean useRangePartitions()
-  {
-    return (ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec() instanceof SingleDimensionPartitionsSpec);
-  }
-
-  private static void assertDataSketchesAvailable()
-  {
-    try {
-      //noinspection ResultOfObjectAllocationIgnored
-      new StringSketch();
-    }
-    catch (NoClassDefFoundError e) {
-      throw new ISE(
-          e,
-          "DataSketches is unvailable."
-          + " Try loading the druid-datasketches extension from the classpath for the overlord and middleManagers/indexers."
-      );
-    }
-  }
-
   @Override
   public List<DataSegment> findSegmentsToLock(TaskActionClient taskActionClient, List<Interval> intervals)
       throws IOException
@@ -523,6 +499,11 @@ public class ParallelIndexSupervisorTask extends AbstractBatchIndexTask implemen
            && ingestionSchema.getTuningConfig().getMaxNumConcurrentSubTasks() >= minRequiredNumConcurrentSubTasks;
   }
 
+  private boolean useRangePartitions()
+  {
+    return ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec() instanceof SingleDimensionPartitionsSpec;
+  }
+
   /**
    * Run the single phase parallel indexing for best-effort rollup. In this mode, each sub task created by
    * the supervisor task reads data and generates segments individually.
diff --git a/licenses.yaml b/licenses.yaml
index f375dce..c8f6df6 100644
--- a/licenses.yaml
+++ b/licenses.yaml
@@ -3051,7 +3051,7 @@ notices:
 
 name: DataSketches
 license_category: binary
-module: extensions/druid-datasketches
+module: java-core
 license_name: Apache License version 2.0
 version: 1.1.0-incubating
 libraries:
@@ -3061,7 +3061,7 @@ libraries:
 
 name: DataSketches
 license_category: binary
-module: extensions/druid-datasketches
+module: java-core
 license_name: Apache License version 2.0
 version: 1.2.0-incubating
 libraries:


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org
For additional commands, e-mail: commits-help@druid.apache.org