You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@druid.apache.org by jo...@apache.org on 2019/12/12 20:54:52 UTC
[incubator-druid] branch 0.17.0-incubating updated: DataSketches jars in core (#9003) (#9012)
This is an automated email from the ASF dual-hosted git repository.
jonwei pushed a commit to branch 0.17.0-incubating
in repository https://gitbox.apache.org/repos/asf/incubator-druid.git
The following commit(s) were added to refs/heads/0.17.0-incubating by this push:
new 37df7fc DataSketches jars in core (#9003) (#9012)
37df7fc is described below
commit 37df7fc8df13fa77425844140cd142c7831dacd0
Author: Chi Cao Minh <ch...@imply.io>
AuthorDate: Thu Dec 12 12:54:39 2019 -0800
DataSketches jars in core (#9003) (#9012)
Having DataSketches jars in core will allow potential improvements, for
example:
- Provide an alternative implementation of HLL:
https://datasketches.github.io/docs/HLL/HllSketchVsDruidHyperLogLogCollector.html
- Range partitioning for native parallel batch indexing without having
the user load extensions on the classpath
Dev mailing list discussion:
https://lists.apache.org/thread.html/301410d71ff799cf616bf17c4ebcf9999fc30829f5fa62909f403e6c%40%3Cdev.druid.apache.org%3E
---
core/pom.xml | 16 ++++++++++++
docs/ingestion/native-batch.md | 4 ---
extensions-core/datasketches/pom.xml | 2 ++
.../parallel/ParallelIndexSupervisorTask.java | 29 ++++------------------
licenses.yaml | 4 +--
5 files changed, 25 insertions(+), 30 deletions(-)
diff --git a/core/pom.xml b/core/pom.xml
index 23361ff..5376a1a 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -42,6 +42,22 @@
<dependencies>
<dependency>
+ <groupId>org.apache.datasketches</groupId>
+ <artifactId>datasketches-java</artifactId>
+ <scope>runtime</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>com.google.code.findbugs</groupId>
+ <artifactId>annotations</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.datasketches</groupId>
+ <artifactId>datasketches-memory</artifactId>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md
index f1af935..8934326 100644
--- a/docs/ingestion/native-batch.md
+++ b/docs/ingestion/native-batch.md
@@ -262,10 +262,6 @@ The three `partitionsSpec` types have different pros and cons:
#### Single-dimension range partitioning
-> Single-dimension range partitioning currently requires the
-> [druid-datasketches](../development/extensions-core/datasketches-extension.md)
-> extension to be [loaded from the classpath](../development/extensions.md#loading-extensions-from-the-classpath).
-
> Because single-range partitioning makes two passes over the input, the index task may fail if the input changes
> in between the two passes.
diff --git a/extensions-core/datasketches/pom.xml b/extensions-core/datasketches/pom.xml
index 4cb7131..2a93114 100644
--- a/extensions-core/datasketches/pom.xml
+++ b/extensions-core/datasketches/pom.xml
@@ -38,6 +38,7 @@
<dependency>
<groupId>org.apache.datasketches</groupId>
<artifactId>datasketches-java</artifactId>
+ <scope>provided</scope>
<exclusions>
<exclusion>
<groupId>com.google.code.findbugs</groupId>
@@ -48,6 +49,7 @@
<dependency>
<groupId>org.apache.datasketches</groupId>
<artifactId>datasketches-memory</artifactId>
+ <scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.calcite</groupId>
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
index db31af6..ed30714 100644
--- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
+++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
@@ -61,7 +61,6 @@ import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTaskRun
import org.apache.druid.indexing.common.task.batch.parallel.distribution.PartitionBoundaries;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger;
-import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketch;
import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
@@ -376,32 +375,9 @@ public class ParallelIndexSupervisorTask extends AbstractBatchIndexTask implemen
@Override
public boolean isReady(TaskActionClient taskActionClient) throws Exception
{
- if (useRangePartitions()) {
- assertDataSketchesAvailable();
- }
return determineLockGranularityAndTryLock(taskActionClient, ingestionSchema.getDataSchema().getGranularitySpec());
}
- private boolean useRangePartitions()
- {
- return (ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec() instanceof SingleDimensionPartitionsSpec);
- }
-
- private static void assertDataSketchesAvailable()
- {
- try {
- //noinspection ResultOfObjectAllocationIgnored
- new StringSketch();
- }
- catch (NoClassDefFoundError e) {
- throw new ISE(
- e,
- "DataSketches is unvailable."
- + " Try loading the druid-datasketches extension from the classpath for the overlord and middleManagers/indexers."
- );
- }
- }
-
@Override
public List<DataSegment> findSegmentsToLock(TaskActionClient taskActionClient, List<Interval> intervals)
throws IOException
@@ -523,6 +499,11 @@ public class ParallelIndexSupervisorTask extends AbstractBatchIndexTask implemen
&& ingestionSchema.getTuningConfig().getMaxNumConcurrentSubTasks() >= minRequiredNumConcurrentSubTasks;
}
+ private boolean useRangePartitions()
+ {
+ return ingestionSchema.getTuningConfig().getGivenOrDefaultPartitionsSpec() instanceof SingleDimensionPartitionsSpec;
+ }
+
/**
* Run the single phase parallel indexing for best-effort rollup. In this mode, each sub task created by
* the supervisor task reads data and generates segments individually.
diff --git a/licenses.yaml b/licenses.yaml
index f375dce..c8f6df6 100644
--- a/licenses.yaml
+++ b/licenses.yaml
@@ -3051,7 +3051,7 @@ notices:
name: DataSketches
license_category: binary
-module: extensions/druid-datasketches
+module: java-core
license_name: Apache License version 2.0
version: 1.1.0-incubating
libraries:
@@ -3061,7 +3061,7 @@ libraries:
name: DataSketches
license_category: binary
-module: extensions/druid-datasketches
+module: java-core
license_name: Apache License version 2.0
version: 1.2.0-incubating
libraries:
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org
For additional commands, e-mail: commits-help@druid.apache.org