You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/08/25 18:44:21 UTC

[incubator-datasketches-java] 02/05: Interim

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch RelativeErrorQuantiles
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit c403ee5ea0a2c1ad7662d2b5109ac66a300a92e3
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Thu Aug 13 16:13:21 2020 -0700

    Interim
---
 src/main/java/org/apache/datasketches/Util.java    |  26 ++++
 .../apache/datasketches/kll/RelativeCompactor.java | 110 ++++++++++++++++
 ...eFloatsSketch.java => RelativeErrorSketch.java} | 139 ++++++++++++---------
 .../kll/RelativeErrorSketchBuilder.java            | 129 +++++++++++++++++++
 .../apache/datasketches/kll/RelativeErrorUtil.java |  36 ++++++
 .../java/org/apache/datasketches/UtilTest.java     |  21 +++-
 6 files changed, 402 insertions(+), 59 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/Util.java b/src/main/java/org/apache/datasketches/Util.java
index fa5ecc8..08b85a1 100644
--- a/src/main/java/org/apache/datasketches/Util.java
+++ b/src/main/java/org/apache/datasketches/Util.java
@@ -381,6 +381,32 @@ public final class Util {
   //Powers of 2 related
 
   /**
+   * Returns the number of one bits following the lowest-order ("rightmost") zero-bit in the
+   * two's complement binary representation of the specified long value, or 64 if the value is equal
+   * to minus one.
+   * @param v the value whose number of trailing ones is to be computed.
+   * @return the number of one bits following the lowest-order ("rightmost") zero-bit in the
+   * two's complement binary representation of the specified long value, or 64 if the value is equal
+   * to minus one.
+   */
+  public static int numberOfTrailingOnes(final long v) {
+    return Long.numberOfTrailingZeros(~v);
+  }
+
+  /**
+   * Returns the number of one bits preceding the highest-order ("leftmost") one-bit in the
+   * two's complement binary representation of the specified long value, or 64 if the value is equal
+   * to minus one.
+   * @param v the value whose number of leading ones is to be computed.
+   * @return the number of one bits preceding the lowest-order ("rightmost") zero-bit in the
+   * two's complement binary representation of the specified long value, or 64 if the value is equal
+   * to minus one.
+   */
+  public static int numberOfLeadingOnes(final long v) {
+    return Long.numberOfLeadingZeros(~v);
+  }
+
+  /**
    * Returns true if argument is exactly a positive power of 2 and greater than zero.
    *
    * @param v The input argument.
diff --git a/src/main/java/org/apache/datasketches/kll/RelativeCompactor.java b/src/main/java/org/apache/datasketches/kll/RelativeCompactor.java
new file mode 100644
index 0000000..9d1b4be
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/kll/RelativeCompactor.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.kll;
+
+import static org.apache.datasketches.kll.RelativeErrorUtil.INIT_NUMBER_OF_SECTIONS;
+
+import java.util.Arrays;
+
+import org.apache.datasketches.kll.RelativeErrorUtil.Schedule;
+
+/**
+ * @author Lee Rhodes
+ */
+@SuppressWarnings({"javadoc","unused"})
+public class RelativeCompactor {
+  private int numCompactions = 0; //number of compaction operations performed
+  private int state = 0; //state of the deterministic compaction schedule
+  private int offset = 0; //0 or 1 uniformly at random in each compaction
+  private float[] buf;
+  private boolean sorted = false;
+  private int numValues = 0;
+
+  //extracted from constructor
+  private boolean alternate = true; //every other compaction has the opposite offset
+  private int sectionSize = 32;
+  private int numSections = INIT_NUMBER_OF_SECTIONS; //# of sections in the buffer
+  private int always = sectionSize;
+  private int never = sectionSize * numSections;
+  private boolean neverGrows = true;
+  private int height = 0;
+  private Schedule schedule = Schedule.DETERMINISTIC;
+
+  //derived
+  private float sectionSizeF = sectionSize;
+
+  //Empty Constructor, assume all defaults
+  public RelativeCompactor() { }
+
+  //Constructor
+  public RelativeCompactor(
+      final Schedule schedule,
+      final int sectionSize,
+      final int numSections,
+      final int always,
+      final int never,
+      final boolean neverGrows,
+      final int height,
+      final boolean alternate) {
+    this.schedule = schedule;
+    this.sectionSize = sectionSize;
+    this.numSections = numSections;
+    this.always = always;
+    this.never = never;
+    this.neverGrows = neverGrows;
+    this.height = height;
+    this.alternate = alternate;
+    final int cap = never + (numSections * sectionSize) + always;
+    buf = new float[cap];
+  }
+
+  public float[] getBuf() { return buf; }
+
+  public int getNumValues() { return numValues; }
+
+  public void incrementNumValus() { numValues++; }
+
+  public void compact() {
+    //assert
+  }
+
+  public int capacity() {
+    final int cap = never + (numSections * sectionSize) + always;
+    assert cap > 1;
+    return cap;
+  }
+
+  public int rank(final float value) { //one-based
+    sort();
+    int count = 0;
+    for (int i = 0; i < numValues; i++) {
+      if (buf[i] <= value) { count++; continue; }
+      break;
+    }
+    return count;
+  }
+
+  public void sort() {
+    if (!sorted) {
+      Arrays.sort(buf, 0, numValues);
+      sorted = true;
+    }
+  }
+}
diff --git a/src/main/java/org/apache/datasketches/kll/KllReFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketch.java
similarity index 56%
rename from src/main/java/org/apache/datasketches/kll/KllReFloatsSketch.java
rename to src/main/java/org/apache/datasketches/kll/RelativeErrorSketch.java
index 760d061..82addb2 100644
--- a/src/main/java/org/apache/datasketches/kll/KllReFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketch.java
@@ -22,8 +22,12 @@ package org.apache.datasketches.kll;
 import static java.lang.Math.ceil;
 import static java.lang.Math.sqrt;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
 
+import org.apache.datasketches.kll.RelativeErrorUtil.Schedule;
+
 /**
  * Proof-of-concept code for paper "Relative Error Streaming Quantiles",
  * https://arxiv.org/abs/2004.01668.
@@ -55,7 +59,7 @@ import java.util.Random;
  * @author Lee Rhodes
  */
 @SuppressWarnings("unused")
-public class KllReFloatsSketch {
+public class RelativeErrorSketch {
 
   static {
     final double x = sqrt(2);
@@ -63,60 +67,48 @@ public class KllReFloatsSketch {
     final Random rand =  new Random();
   }
 
-  // Constants
-
-  private final static double SECTION_SIZE_SCALAR = 0.5;
-  private final static double NEVER_SIZE_SCALAR = 0.5;
-  private final static int INIT_NUMBER_OF_SECTIONS = 2;
-  private final static int SMALLEST_MEANINGFUL_SECTION_SIZE = 4;
-  private final static double DEFAULT_EPS = 0.01;
-  private final static double EPS_UPPER_BOUND = 0.1; //the sketch gives rather bad results for eps > 0.1
-
-  /**
-   * Schedule
-   * @author Lee Rhodes
-   */
-  @SuppressWarnings("javadoc")
-  public enum Schedule { DETERMINISTIC, RANDOMIZED }
+  //final class parameters
+  private final double eps; //default = DEFAULT_EPS = .01
+  private final Schedule schedule; //default = Schedule.DETERMINISTIC;
+  private final int initNumSections; //default = INIT_NUMBER_OF_SECTIONS;
+  private final boolean lazy; //default = true;
+  private final boolean alternate;//default = true;
+  private final boolean neverGrows; //default = false;
 
   //class variables
-  private double eps = DEFAULT_EPS;
-  private Schedule sch = Schedule.DETERMINISTIC;
-  private int always = -1;
-  private int never = -1;
-  private int sectionSize = -1;
-  private int initNumSections = INIT_NUMBER_OF_SECTIONS;
-  private boolean lazy = true;
-  private boolean alternate = true;
-  private boolean neverGrows = false;
-  private RelativeCompactor compactor = new RelativeCompactor();
-  private RelativeCompactor[] compactors = new RelativeCompactor[0];
+  private int always;
+  private int never; //the part that is never compacted. Default = sectionSize * numSections
+  private int sectionSize; //the number of sections & determined by eps
+
+  private List<RelativeCompactor> compactors = new ArrayList<>();
   private int H = 0;
   private int size = 0;
-
+  private int maxSize = 0;
 
   /**
    * Constructor.
-   * @param eps blah
-   * @param sch blah
-   * @param always blah
-   * @param never blah
-   * @param sectionSize blah
-   * @param initNumSections blah
-   * @param lazy blah
-   * @param alternate blah
+   * @param eps target error rate. Must be less than .1. Default = .01.
+   * @param sch schedule, whether deterministic or randomized. Default = deterministic.
+   * @param always the size of the buffer that is always compacted
+   * @param never the size of the buffer that is never compacted
+   * @param sectionSize the size of a section in the buffer
+   * @param initNumSections the initial number of sections. Default = 2.
+   * @param lazy if true, stop compressing after the first compressible candidate found.
+   * @param alternate if true, then random choice of odd/even done every other time.
+   * @param neverGrows if true, we do not let "never" grow.
    */
-  public KllReFloatsSketch(
+  RelativeErrorSketch(
       final double eps,
-      final Schedule sch,
+      final Schedule schedule,
       final int always,
       final int never,
       final int sectionSize,
       final int initNumSections,
       final boolean lazy,
-      final boolean alternate) {
+      final boolean alternate,
+      final boolean neverGrows) {
     this.eps = eps;
-    this.sch = sch;
+    this.schedule = schedule;
     this.always = always;
     this.never = never;
     this.sectionSize = sectionSize;
@@ -124,36 +116,67 @@ public class KllReFloatsSketch {
     this.initNumSections = initNumSections;
     this.lazy = lazy;
     this.alternate = alternate;
+    this.neverGrows = neverGrows;
 
-    // default setting of sectionSize, always, and never according to eps
-    if (this.sectionSize == -1) {
-      // ensured to be even and positive (thus >= 2)
-      this.sectionSize = (2 * (int)(SECTION_SIZE_SCALAR / eps)) + 1;
-    }
-    if (this.always == -1) { this.always = sectionSize; }
-
-    neverGrows = false; //if never is set by the user, then we do not let it grow
-    if (this.never == -1) {
-      this.never = (int)(NEVER_SIZE_SCALAR * this.sectionSize * this.initNumSections);
-      neverGrows = true;
-    }
-    compactors = new RelativeCompactor[0];
     H = 0;
     size = 0;
+    maxSize = 0;
     grow();
   }
 
   void grow() {
+    compactors.add(
+        new RelativeCompactor(schedule,
+                              sectionSize,
+                              initNumSections,
+                              always,
+                              never,
+                              neverGrows,
+                              H,
+                              alternate
+        ));
+    H = compactors.size();
+    updateMaxSize();
+  }
+
+  void updateMaxSize() {
+    int maxSize = 0;
+    for (RelativeCompactor c : compactors) { maxSize += c.capacity(); }
+    this.maxSize = maxSize;
+  }
+
+  void update(final float item) {
+    final RelativeCompactor c = compactors.get(0);
+
+  }
+
+  void compress(final boolean lazy) {
+
+  }
+
+  void mergeIntoSelf(final RelativeErrorSketch sketch) {
+
+  }
+
+  void merge(final RelativeErrorSketch sketch1, final RelativeErrorSketch sketch2) {
+
+  }
 
+  int rank(final float value) {
+    return 0;
   }
 
+  Pair[] cdf() {
+    return null;
+  }
 
-  class RelativeCompactor {
-    int numCompactions = 0;
+  Pair[] ranks() {
+    return null;
+  }
 
-    public RelativeCompactor() {
-      numCompactions = 0; // number of compaction operations performed
-    }
+  class Pair {
+    float rank;
+    float value;
   }
 
 }
diff --git a/src/main/java/org/apache/datasketches/kll/RelativeErrorSketchBuilder.java b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketchBuilder.java
new file mode 100644
index 0000000..e7601af
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/kll/RelativeErrorSketchBuilder.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.kll;
+
+import static org.apache.datasketches.kll.RelativeErrorUtil.DEFAULT_EPS;
+import static org.apache.datasketches.kll.RelativeErrorUtil.EPS_UPPER_BOUND;
+import static org.apache.datasketches.kll.RelativeErrorUtil.INIT_NUMBER_OF_SECTIONS;
+import static org.apache.datasketches.kll.RelativeErrorUtil.NEVER_SIZE_SCALAR;
+import static org.apache.datasketches.kll.RelativeErrorUtil.SECTION_SIZE_SCALAR;
+
+import org.apache.datasketches.SketchesArgumentException;
+import org.apache.datasketches.kll.RelativeErrorUtil.Schedule;
+
+/**
+ * @author Lee Rhodes
+ */
+public class RelativeErrorSketchBuilder {
+  private double bEps = DEFAULT_EPS;
+  private Schedule bSchedule = Schedule.DETERMINISTIC;
+  private int bInitNumSections = INIT_NUMBER_OF_SECTIONS;
+  private boolean bLazy = true;
+  private boolean bAlternate = true;
+  private boolean bNeverGrows = false;
+  private int bAlways = -1;
+  private int bNever = -1;
+  private int bSectionSize = -1;
+
+  /**
+   * Default constructor
+   */
+  public RelativeErrorSketchBuilder() {}
+
+  /**
+   * Builds a RelativeErrorSketch
+   * @return a RelativeErrorSketch
+   */
+  public RelativeErrorSketch build() {
+    // default setting of sectionSize, always, and "never", according to eps
+    if (bSectionSize == -1) {
+      // ensured to be even and positive (thus >= 2)
+      bSectionSize = 2 * ((int)(SECTION_SIZE_SCALAR / bEps) + 1);
+    }
+    if (bAlways == -1) { bAlways = bSectionSize; }
+
+    bNeverGrows = false; //if never is set true by the user, then we do not let it grow
+    if (bNever == -1) {
+      bNever = (int)(NEVER_SIZE_SCALAR * bSectionSize * bInitNumSections);
+      bNeverGrows = true;
+    }
+    return new RelativeErrorSketch(bEps, bSchedule, bAlways, bNever, bSectionSize, bInitNumSections,
+        bLazy, bAlternate, bNeverGrows);
+  }
+
+  /**
+   * Set the target error rate. Must be less than .1. Default = .01.
+   * @param eps the target error rate
+   */
+  public void setEps(final double eps) {
+    if (eps > EPS_UPPER_BOUND) {
+      throw new SketchesArgumentException("eps must be at most " + EPS_UPPER_BOUND);
+    }
+    bEps = eps;
+  }
+
+  /**
+   * Set whether DETERMINISTIC, RANDOMIZED or RANDOMIZED_LINAR. Default = DETERMINISTIC.
+   * @param schedule whether DETERMINISTIC, RANDOMIZED or RANDOMIZED_LINAR.
+   */
+  public void setSchedule(final Schedule schedule) {
+    bSchedule = schedule;
+  }
+
+  /**
+   * Sets the initial number of sections. Default = 2.
+   * @param initNumSections the initial number of sections.
+   */
+  public void setInitNumSections(final int initNumSections) {
+    bInitNumSections = initNumSections;
+  }
+
+  /**
+   * Sets lazy.
+   * @param lazy if true, stop compressing after the first compressible candidate found.
+   */
+  public void setLazy(final boolean lazy) {
+    bLazy = lazy;
+  }
+
+  /**
+   * Sets alternate.
+   * @param alternate if true, then random choice of odd/even done every other time.
+   */
+  public void setAlternate(final boolean alternate) {
+    bAlternate = alternate;
+  }
+
+  /**
+   * Sets always
+   * @param always the size of the buffer that is always compacted
+   */
+  public void setAlways(final int always) {
+    bAlways = always;
+  }
+
+  /**
+   * Sets never
+   * @param never the size of the buffer that is never compacted
+   */
+  public void setNever(final int never) {
+    bNever = never;
+  }
+}
diff --git a/src/main/java/org/apache/datasketches/kll/RelativeErrorUtil.java b/src/main/java/org/apache/datasketches/kll/RelativeErrorUtil.java
new file mode 100644
index 0000000..7efb4a8
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/kll/RelativeErrorUtil.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.kll;
+
+/**
+ * @author Lee Rhodes
+ */
+@SuppressWarnings({"javadoc"})
+public class RelativeErrorUtil {
+  final static double SECTION_SIZE_SCALAR = 0.5;
+  final static double NEVER_SIZE_SCALAR = 0.5;
+  final static int INIT_NUMBER_OF_SECTIONS = 2;
+  final static int SMALLEST_MEANINGFUL_SECTION_SIZE = 4;
+  final static double DEFAULT_EPS = 0.01;
+  //the sketch gives rather bad results for eps > 0.1
+  final static double EPS_UPPER_BOUND = 0.1;
+
+  public enum Schedule { DETERMINISTIC, RANDOMIZED, RANDOMIZED_LINAR }
+}
diff --git a/src/test/java/org/apache/datasketches/UtilTest.java b/src/test/java/org/apache/datasketches/UtilTest.java
index 4ca1445..b84f55b 100644
--- a/src/test/java/org/apache/datasketches/UtilTest.java
+++ b/src/test/java/org/apache/datasketches/UtilTest.java
@@ -40,11 +40,14 @@ import static org.apache.datasketches.Util.isMultipleOf8AndGT0;
 import static org.apache.datasketches.Util.isPowerOf2;
 import static org.apache.datasketches.Util.milliSecToString;
 import static org.apache.datasketches.Util.nanoSecToString;
+import static org.apache.datasketches.Util.numberOfLeadingOnes;
+import static org.apache.datasketches.Util.numberOfTrailingOnes;
 import static org.apache.datasketches.Util.pwr2LawNext;
 import static org.apache.datasketches.Util.pwr2LawPrev;
 import static org.apache.datasketches.Util.pwrLawNextDouble;
 import static org.apache.datasketches.Util.simpleLog2OfLong;
 import static org.apache.datasketches.Util.zeroPad;
+import static org.testng.Assert.assertEquals;
 import static org.testng.Assert.assertTrue;
 import static org.testng.Assert.fail;
 
@@ -65,6 +68,22 @@ public class UtilTest {
     checkIfPowerOf2(31, "31");
   }
 
+
+  @Test
+  public void numTrailingOnes() {
+    long mask = 1L;
+    for (int i = 0; i <= 64; i++) {
+      long v = ~mask & -1L;
+      mask <<= 1;
+      int numT1s = numberOfTrailingOnes(v);
+      int numL1s = numberOfLeadingOnes(v);
+      assertEquals(Long.numberOfTrailingZeros(~v), numT1s);
+      assertEquals(Long.numberOfLeadingZeros(~v), numL1s);
+      //println(zeroPad(Long.toBinaryString(v),64) + ", " + numL1s + ", " + numT1s);
+      continue;
+    }
+  }
+
   @Test
   public void checkIsPowerOf2() {
     Assert.assertEquals(isPowerOf2(0), false);
@@ -383,7 +402,7 @@ public class UtilTest {
    */
   static void print(final Object o) {
     if (o != null) {
-      //System.out.print(o.toString()); //disable here
+      System.out.print(o.toString()); //disable here
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org