You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/06/02 02:06:20 UTC

[incubator-datasketches-hive] branch master updated: Fix theta sketch estimation if input has extra bytes

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-hive.git


The following commit(s) were added to refs/heads/master by this push:
     new c3a8689  Fix theta sketch estimation if input has extra bytes
     new 0e901d0  Merge pull request #51 from koke/theta-sketch-estimate-fix-byte-length
c3a8689 is described below

commit c3a86896f4b05b48b2e85086af482e4a147663ef
Author: Jorge Bernal <jb...@gmail.com>
AuthorDate: Fri May 29 12:30:44 2020 +0200

    Fix theta sketch estimation if input has extra bytes
    
    In some cases the backing array returned by BytesWritable.getBytes() is larger
    than the valid data; getLength() gives the correct number of bytes to read.
---
 .../datasketches/hive/theta/EstimateSketchUDF.java   |  3 ++-
 .../hive/theta/EstimateSketchUDFTest.java            | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/datasketches/hive/theta/EstimateSketchUDF.java b/src/main/java/org/apache/datasketches/hive/theta/EstimateSketchUDF.java
index dd4c1a7..058f875 100644
--- a/src/main/java/org/apache/datasketches/hive/theta/EstimateSketchUDF.java
+++ b/src/main/java/org/apache/datasketches/hive/theta/EstimateSketchUDF.java
@@ -56,7 +56,8 @@ public class EstimateSketchUDF extends UDF {
       return 0.0;
     }
 
-    final byte[] serializedSketch = binarySketch.getBytes();
+    final byte[] serializedSketch = new byte[binarySketch.getLength()];
+    System.arraycopy(binarySketch.getBytes(), 0, serializedSketch, 0, binarySketch.getLength());
 
     if (serializedSketch.length <= EMPTY_SKETCH_SIZE_BYTES) {
       return 0.0;
diff --git a/src/test/java/org/apache/datasketches/hive/theta/EstimateSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/theta/EstimateSketchUDFTest.java
index 3e2811f..d538afa 100644
--- a/src/test/java/org/apache/datasketches/hive/theta/EstimateSketchUDFTest.java
+++ b/src/test/java/org/apache/datasketches/hive/theta/EstimateSketchUDFTest.java
@@ -97,4 +97,24 @@ public class EstimateSketchUDFTest {
     assertEquals(128.0, testResult);
   }
 
+  @Test
+  public void evaluateRespectsByteLength() {
+    // In some instances, the BytesWritable buffer returned by getBytes() might be larger than the actual sketch bytes.
+    // getLength() should give the correct length to use.
+    //
+    // https://github.com/apache/incubator-datasketches-hive/issues/50
+
+    byte[] inputBytes = new byte[]{
+            (byte) 0x01, (byte) 0x03, (byte) 0x03, (byte) 0x00,
+            (byte) 0x00, (byte) 0x3a, (byte) 0xcc, (byte) 0x93,
+            (byte) 0x15, (byte) 0xf9, (byte) 0x7d, (byte) 0xcb,
+            (byte) 0xbd, (byte) 0x86, (byte) 0xa1, (byte) 0x05,
+            (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00,
+            (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00
+    };
+    BytesWritable input = new BytesWritable(inputBytes, 16);
+    EstimateSketchUDF estimate = new EstimateSketchUDF();
+    Double testResult = estimate.evaluate(input);
+    assertEquals(1.0, testResult, 0.0);
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org