You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/08/25 18:44:21 UTC
[incubator-datasketches-java] 02/05: Interim
This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch RelativeErrorQuantiles
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git
commit c403ee5ea0a2c1ad7662d2b5109ac66a300a92e3
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Thu Aug 13 16:13:21 2020 -0700
Interim
---
src/main/java/org/apache/datasketches/Util.java | 26 ++++
.../apache/datasketches/kll/RelativeCompactor.java | 110 ++++++++++++++++
...eFloatsSketch.java => RelativeErrorSketch.java} | 139 ++++++++++++---------
.../kll/RelativeErrorSketchBuilder.java | 129 +++++++++++++++++++
.../apache/datasketches/kll/RelativeErrorUtil.java | 36 ++++++
.../java/org/apache/datasketches/UtilTest.java | 21 +++-
6 files changed, 402 insertions(+), 59 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/Util.java b/src/main/java/org/apache/datasketches/Util.java
index fa5ecc8..08b85a1 100644
--- a/src/main/java/org/apache/datasketches/Util.java
+++ b/src/main/java/org/apache/datasketches/Util.java
@@ -381,6 +381,32 @@ public final class Util {
//Powers of 2 related
/**
+ * Returns the number of one bits following the lowest-order ("rightmost") zero-bit in the
+ * two's complement binary representation of the specified long value, or 64 if the value is equal
+ * to minus one.
+ * @param v the value whose number of trailing ones is to be computed.
+ * @return the number of one bits following the lowest-order ("rightmost") zero-bit in the
+ * two's complement binary representation of the specified long value, or 64 if the value is equal
+ * to minus one.
+ */
+ public static int numberOfTrailingOnes(final long v) {
+ return Long.numberOfTrailingZeros(~v);
+ }
+
+ /**
+ * Returns the number of one bits preceding the highest-order ("leftmost") one-bit in the
+ * two's complement binary representation of the specified long value, or 64 if the value is equal
+ * to minus one.
+ * @param v the value whose number of leading ones is to be computed.
+ * @return the number of one bits preceding the lowest-order ("rightmost") zero-bit in the
+ * two's complement binary representation of the specified long value, or 64 if the value is equal
+ * to minus one.
+ */
+ public static int numberOfLeadingOnes(final long v) {
+ return Long.numberOfLeadingZeros(~v);
+ }
+
+ /**
* Returns true if argument is exactly a positive power of 2 and greater than zero.
*
* @param v The input argument.
diff --git a/src/main/java/org/apache/datasketches/kll/RelativeCompactor.java b/src/main/java/org/apache/datasketches/kll/RelativeCompactor.java
new file mode 100644
index 0000000..9d1b4be
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/kll/RelativeCompactor.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.kll;
+
+import static org.apache.datasketches.kll.RelativeErrorUtil.INIT_NUMBER_OF_SECTIONS;
+
+import java.util.Arrays;
+
+import org.apache.datasketches.kll.RelativeErrorUtil.Schedule;
+
+/**
+ * @author Lee Rhodes
+ */
+@SuppressWarnings({"javadoc","unused"})
+public class RelativeCompactor {
+ private int numCompactions = 0; //number of compaction operations performed
+ private int state = 0; //state of the deterministic compaction schedule
+ private int offset = 0; //0 or 1 uniformly at random in each compaction
+ private float[] buf;
+ private boolean sorted = false;
+ private int numValues = 0;
+
+ //extracted from constructor
+ private boolean alternate = true; //every other compaction has the opposite offset
+ private int sectionSize = 32;
+ private int numSections = INIT_NUMBER_OF_SECTIONS; //# of sections in the buffer
+ private int always = sectionSize;
+ private int never = sectionSize * numSections;
+ private boolean neverGrows = true;
+ private int height = 0;
+ private Schedule schedule = Schedule.DETERMINISTIC;
+
+ //derived
+ private float sectionSizeF = sectionSize;
+
+ //Empty Constructor, assume all defaults
+ public RelativeCompactor() { }
+
+ //Constructor
+ public RelativeCompactor(
+ final Schedule schedule,
+ final int sectionSize,
+ final int numSections,
+ final int always,
+ final int never,
+ final boolean neverGrows,
+ final int height,
+ final boolean alternate) {
+ this.schedule = schedule;
+ this.sectionSize = sectionSize;
+ this.numSections = numSections;
+ this.always = always;
+ this.never = never;
+ this.neverGrows = neverGrows;
+ this.height = height;
+ this.alternate = alternate;
+ final int cap = never + (numSections * sectionSize) + always;
+ buf = new float[cap];
+ }
+
+ public float[] getBuf() { return buf; }
+
+ public int getNumValues() { return numValues; }
+
+ public void incrementNumValus() { numValues++; }
+
+ public void compact() {
+ //assert
+ }
+
+ public int capacity() {
+ final int cap = never + (numSections * sectionSize) + always;
+ assert cap > 1;
+ return cap;
+ }
+
+ public int rank(final float value) { //one-based
+ sort();
+ int count = 0;
+ for (int i = 0; i < numValues; i++) {
+ if (buf[i] <= value) { count++; continue; }
+ break;
+ }
+ return count;
+ }
+
+ public void sort() {
+ if (!sorted) {
+ Arrays.sort(buf, 0, numValues);
+ sorted = true;
+ }
+ }
+}
diff --git a/src/main/java/org/apache/datasketches/kll/KllReFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketch.java
similarity index 56%
rename from src/main/java/org/apache/datasketches/kll/KllReFloatsSketch.java
rename to src/main/java/org/apache/datasketches/kll/RelativeErrorSketch.java
index 760d061..82addb2 100644
--- a/src/main/java/org/apache/datasketches/kll/KllReFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketch.java
@@ -22,8 +22,12 @@ package org.apache.datasketches.kll;
import static java.lang.Math.ceil;
import static java.lang.Math.sqrt;
+import java.util.ArrayList;
+import java.util.List;
import java.util.Random;
+import org.apache.datasketches.kll.RelativeErrorUtil.Schedule;
+
/**
* Proof-of-concept code for paper "Relative Error Streaming Quantiles",
* https://arxiv.org/abs/2004.01668.
@@ -55,7 +59,7 @@ import java.util.Random;
* @author Lee Rhodes
*/
@SuppressWarnings("unused")
-public class KllReFloatsSketch {
+public class RelativeErrorSketch {
static {
final double x = sqrt(2);
@@ -63,60 +67,48 @@ public class KllReFloatsSketch {
final Random rand = new Random();
}
- // Constants
-
- private final static double SECTION_SIZE_SCALAR = 0.5;
- private final static double NEVER_SIZE_SCALAR = 0.5;
- private final static int INIT_NUMBER_OF_SECTIONS = 2;
- private final static int SMALLEST_MEANINGFUL_SECTION_SIZE = 4;
- private final static double DEFAULT_EPS = 0.01;
- private final static double EPS_UPPER_BOUND = 0.1; //the sketch gives rather bad results for eps > 0.1
-
- /**
- * Schedule
- * @author Lee Rhodes
- */
- @SuppressWarnings("javadoc")
- public enum Schedule { DETERMINISTIC, RANDOMIZED }
+ //final class parameters
+ private final double eps; //default = DEFAULT_EPS = .01
+ private final Schedule schedule; //default = Schedule.DETERMINISTIC;
+ private final int initNumSections; //default = INIT_NUMBER_OF_SECTIONS;
+ private final boolean lazy; //default = true;
+ private final boolean alternate;//default = true;
+ private final boolean neverGrows; //default = false;
//class variables
- private double eps = DEFAULT_EPS;
- private Schedule sch = Schedule.DETERMINISTIC;
- private int always = -1;
- private int never = -1;
- private int sectionSize = -1;
- private int initNumSections = INIT_NUMBER_OF_SECTIONS;
- private boolean lazy = true;
- private boolean alternate = true;
- private boolean neverGrows = false;
- private RelativeCompactor compactor = new RelativeCompactor();
- private RelativeCompactor[] compactors = new RelativeCompactor[0];
+ private int always;
+ private int never; //the part that is never compacted. Default = sectionSize * numSections
+ private int sectionSize; //the number of sections & determined by eps
+
+ private List<RelativeCompactor> compactors = new ArrayList<>();
private int H = 0;
private int size = 0;
-
+ private int maxSize = 0;
/**
* Constructor.
- * @param eps blah
- * @param sch blah
- * @param always blah
- * @param never blah
- * @param sectionSize blah
- * @param initNumSections blah
- * @param lazy blah
- * @param alternate blah
+ * @param eps target error rate. Must be less than .1. Default = .01.
+ * @param sch schedule, whether deterministic or randomized. Default = deterministic.
+ * @param always the size of the buffer that is always compacted
+ * @param never the size of the buffer that is never compacted
+ * @param sectionSize the size of a section in the buffer
+ * @param initNumSections the initial number of sections. Default = 2.
+ * @param lazy if true, stop compressing after the first compressible candidate found.
+ * @param alternate if true, then random choice of odd/even done every other time.
+ * @param neverGrows if true, we do not let "never" grow.
*/
- public KllReFloatsSketch(
+ RelativeErrorSketch(
final double eps,
- final Schedule sch,
+ final Schedule schedule,
final int always,
final int never,
final int sectionSize,
final int initNumSections,
final boolean lazy,
- final boolean alternate) {
+ final boolean alternate,
+ final boolean neverGrows) {
this.eps = eps;
- this.sch = sch;
+ this.schedule = schedule;
this.always = always;
this.never = never;
this.sectionSize = sectionSize;
@@ -124,36 +116,67 @@ public class KllReFloatsSketch {
this.initNumSections = initNumSections;
this.lazy = lazy;
this.alternate = alternate;
+ this.neverGrows = neverGrows;
- // default setting of sectionSize, always, and never according to eps
- if (this.sectionSize == -1) {
- // ensured to be even and positive (thus >= 2)
- this.sectionSize = (2 * (int)(SECTION_SIZE_SCALAR / eps)) + 1;
- }
- if (this.always == -1) { this.always = sectionSize; }
-
- neverGrows = false; //if never is set by the user, then we do not let it grow
- if (this.never == -1) {
- this.never = (int)(NEVER_SIZE_SCALAR * this.sectionSize * this.initNumSections);
- neverGrows = true;
- }
- compactors = new RelativeCompactor[0];
H = 0;
size = 0;
+ maxSize = 0;
grow();
}
void grow() {
+ compactors.add(
+ new RelativeCompactor(schedule,
+ sectionSize,
+ initNumSections,
+ always,
+ never,
+ neverGrows,
+ H,
+ alternate
+ ));
+ H = compactors.size();
+ updateMaxSize();
+ }
+
+ void updateMaxSize() {
+ int maxSize = 0;
+ for (RelativeCompactor c : compactors) { maxSize += c.capacity(); }
+ this.maxSize = maxSize;
+ }
+
+ void update(final float item) {
+ final RelativeCompactor c = compactors.get(0);
+
+ }
+
+ void compress(final boolean lazy) {
+
+ }
+
+ void mergeIntoSelf(final RelativeErrorSketch sketch) {
+
+ }
+
+ void merge(final RelativeErrorSketch sketch1, final RelativeErrorSketch sketch2) {
+
+ }
+ int rank(final float value) {
+ return 0;
}
+ Pair[] cdf() {
+ return null;
+ }
- class RelativeCompactor {
- int numCompactions = 0;
+ Pair[] ranks() {
+ return null;
+ }
- public RelativeCompactor() {
- numCompactions = 0; // number of compaction operations performed
- }
+ class Pair {
+ float rank;
+ float value;
}
}
diff --git a/src/main/java/org/apache/datasketches/kll/RelativeErrorSketchBuilder.java b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketchBuilder.java
new file mode 100644
index 0000000..e7601af
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketchBuilder.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.kll;
+
+import static org.apache.datasketches.kll.RelativeErrorUtil.DEFAULT_EPS;
+import static org.apache.datasketches.kll.RelativeErrorUtil.EPS_UPPER_BOUND;
+import static org.apache.datasketches.kll.RelativeErrorUtil.INIT_NUMBER_OF_SECTIONS;
+import static org.apache.datasketches.kll.RelativeErrorUtil.NEVER_SIZE_SCALAR;
+import static org.apache.datasketches.kll.RelativeErrorUtil.SECTION_SIZE_SCALAR;
+
+import org.apache.datasketches.SketchesArgumentException;
+import org.apache.datasketches.kll.RelativeErrorUtil.Schedule;
+
+/**
+ * @author Lee Rhodes
+ */
+public class RelativeErrorSketchBuilder {
+ private double bEps = DEFAULT_EPS;
+ private Schedule bSchedule = Schedule.DETERMINISTIC;
+ private int bInitNumSections = INIT_NUMBER_OF_SECTIONS;
+ private boolean bLazy = true;
+ private boolean bAlternate = true;
+ private boolean bNeverGrows = false;
+ private int bAlways = -1;
+ private int bNever = -1;
+ private int bSectionSize = -1;
+
+ /**
+ * Default constructor
+ */
+ public RelativeErrorSketchBuilder() {}
+
+ /**
+ * Builds a RelativeErrorSketch
+ * @return a RelativeErrorSketch
+ */
+ public RelativeErrorSketch build() {
+ // default setting of sectionSize, always, and "never", according to eps
+ if (bSectionSize == -1) {
+ // ensured to be even and positive (thus >= 2)
+ bSectionSize = 2 * ((int)(SECTION_SIZE_SCALAR / bEps) + 1);
+ }
+ if (bAlways == -1) { bAlways = bSectionSize; }
+
+ bNeverGrows = false; //if never is set true by the user, then we do not let it grow
+ if (bNever == -1) {
+ bNever = (int)(NEVER_SIZE_SCALAR * bSectionSize * bInitNumSections);
+ bNeverGrows = true;
+ }
+ return new RelativeErrorSketch(bEps, bSchedule, bAlways, bNever, bSectionSize, bInitNumSections,
+ bLazy, bAlternate, bNeverGrows);
+ }
+
+ /**
+ * Set the target error rate. Must be less than .1. Default = .01.
+ * @param eps the target error rate
+ */
+ public void setEps(final double eps) {
+ if (eps > EPS_UPPER_BOUND) {
+ throw new SketchesArgumentException("eps must be at most " + EPS_UPPER_BOUND);
+ }
+ bEps = eps;
+ }
+
+ /**
+ * Set whether DETERMINISTIC, RANDOMIZED or RANDOMIZED_LINAR. Default = DETERMINISTIC.
+ * @param schedule whether DETERMINISTIC, RANDOMIZED or RANDOMIZED_LINAR.
+ */
+ public void setSchedule(final Schedule schedule) {
+ bSchedule = schedule;
+ }
+
+ /**
+ * Sets the initial number of sections. Default = 2.
+ * @param initNumSections the initial number of sections.
+ */
+ public void setInitNumSections(final int initNumSections) {
+ bInitNumSections = initNumSections;
+ }
+
+ /**
+ * Sets lazy.
+ * @param lazy if true, stop compressing after the first compressible candidate found.
+ */
+ public void setLazy(final boolean lazy) {
+ bLazy = lazy;
+ }
+
+ /**
+ * Sets alternate.
+ * @param alternate if true, then random choice of odd/even done every other time.
+ */
+ public void setAlternate(final boolean alternate) {
+ bAlternate = alternate;
+ }
+
+ /**
+ * Sets always
+ * @param always the size of the buffer that is always compacted
+ */
+ public void setAlways(final int always) {
+ bAlways = always;
+ }
+
+ /**
+ * Sets never
+ * @param never the size of the buffer that is never compacted
+ */
+ public void setNever(final int never) {
+ bNever = never;
+ }
+}
diff --git a/src/main/java/org/apache/datasketches/kll/RelativeErrorUtil.java b/src/main/java/org/apache/datasketches/kll/RelativeErrorUtil.java
new file mode 100644
index 0000000..7efb4a8
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/kll/RelativeErrorUtil.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.kll;
+
+/**
+ * @author Lee Rhodes
+ */
+@SuppressWarnings({"javadoc"})
+public class RelativeErrorUtil {
+ final static double SECTION_SIZE_SCALAR = 0.5;
+ final static double NEVER_SIZE_SCALAR = 0.5;
+ final static int INIT_NUMBER_OF_SECTIONS = 2;
+ final static int SMALLEST_MEANINGFUL_SECTION_SIZE = 4;
+ final static double DEFAULT_EPS = 0.01;
+ //the sketch gives rather bad results for eps > 0.1
+ final static double EPS_UPPER_BOUND = 0.1;
+
+ public enum Schedule { DETERMINISTIC, RANDOMIZED, RANDOMIZED_LINAR }
+}
diff --git a/src/test/java/org/apache/datasketches/UtilTest.java b/src/test/java/org/apache/datasketches/UtilTest.java
index 4ca1445..b84f55b 100644
--- a/src/test/java/org/apache/datasketches/UtilTest.java
+++ b/src/test/java/org/apache/datasketches/UtilTest.java
@@ -40,11 +40,14 @@ import static org.apache.datasketches.Util.isMultipleOf8AndGT0;
import static org.apache.datasketches.Util.isPowerOf2;
import static org.apache.datasketches.Util.milliSecToString;
import static org.apache.datasketches.Util.nanoSecToString;
+import static org.apache.datasketches.Util.numberOfLeadingOnes;
+import static org.apache.datasketches.Util.numberOfTrailingOnes;
import static org.apache.datasketches.Util.pwr2LawNext;
import static org.apache.datasketches.Util.pwr2LawPrev;
import static org.apache.datasketches.Util.pwrLawNextDouble;
import static org.apache.datasketches.Util.simpleLog2OfLong;
import static org.apache.datasketches.Util.zeroPad;
+import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.fail;
@@ -65,6 +68,22 @@ public class UtilTest {
checkIfPowerOf2(31, "31");
}
+
+ @Test
+ public void numTrailingOnes() {
+ long mask = 1L;
+ for (int i = 0; i <= 64; i++) {
+ long v = ~mask & -1L;
+ mask <<= 1;
+ int numT1s = numberOfTrailingOnes(v);
+ int numL1s = numberOfLeadingOnes(v);
+ assertEquals(Long.numberOfTrailingZeros(~v), numT1s);
+ assertEquals(Long.numberOfLeadingZeros(~v), numL1s);
+ //println(zeroPad(Long.toBinaryString(v),64) + ", " + numL1s + ", " + numT1s);
+ continue;
+ }
+ }
+
@Test
public void checkIsPowerOf2() {
Assert.assertEquals(isPowerOf2(0), false);
@@ -383,7 +402,7 @@ public class UtilTest {
*/
static void print(final Object o) {
if (o != null) {
- //System.out.print(o.toString()); //disable here
+ System.out.print(o.toString()); //disable here
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org