You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2014/10/19 18:21:14 UTC

[1/2] git commit: refs/heads/master - Optimize and fix encoding of similarity values

Repository: lucy
Updated Branches:
  refs/heads/master 6e1fe3a73 -> d4a1d737c


Optimize and fix encoding of similarity values

The previous code was a bit unclear as it extracted the LSB of the
exponent as part of the mantissa. This did actually work because
exponent and mantissa were stored next to each other like in the IEEE
format. For the same reason, the code can be optimized to use a single
shift and mask operation. The actual encoding does not change.

More importantly, this commit fixes a bug where, due to a missing range
check, values smaller than 2^-31 were encoded as 255 and subsequently
decoded as approximately 7x10^9 (1.75 * 2^32), the largest representable
value. Such small values are probably rare but this could cause
significant errors when calculating similarities.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/3932c541
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/3932c541
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/3932c541

Branch: refs/heads/master
Commit: 3932c5411070f2781af9a2e8ab73a748adbde2ce
Parents: 6e1fe3a
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Sun Oct 19 17:50:25 2014 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Sun Oct 19 18:18:23 2014 +0200

----------------------------------------------------------------------
 core/Lucy/Index/Similarity.c   | 26 +++++++++++++++++++-------
 core/Lucy/Index/Similarity.cfh |  2 +-
 perl/t/504-similarity.t        |  7 ++++++-
 3 files changed, 26 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/3932c541/core/Lucy/Index/Similarity.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Similarity.c b/core/Lucy/Index/Similarity.c
index cedb6e6..cceffa8 100644
--- a/core/Lucy/Index/Similarity.c
+++ b/core/Lucy/Index/Similarity.c
@@ -31,6 +31,12 @@
 #include "Lucy/Store/OutStream.h"
 #include "Lucy/Util/Freezer.h"
 
+// The exponent range [-31;32] is mapped to [0;63]. Values outside
+// of the range are clamped resulting in 6 bits for the exponent.
+// The IEEE bias is 127, so we have to subtract 127 and add 31 to
+// the upper bits.
+#define EXP_OFFSET ((127 - 31) << 2)
+
 Similarity*
 Sim_new() {
     Similarity *self = (Similarity*)Class_Make_Obj(SIMILARITY);
@@ -157,15 +163,21 @@ Sim_Encode_Norm_IMP(Similarity *self, float f) {
     }
     else {
         const uint32_t bits = *(uint32_t*)&f;
-        uint32_t mantissa   = (bits & 0xffffff) >> 21;
-        uint32_t exponent   = (((bits >> 24) & 0x7f) - 63) + 15;
 
-        if (exponent > 31) {
-            exponent = 31;
-            mantissa = 7;
-        }
+        // The normalized value contains two bits of mantissa (excluding
+        // the implicit leading bit) in the least significant bits and the
+        // exponent in the upper bits.
+        norm = (bits >> 21) & 0x3ff;
 
-        norm = (exponent << 3) | mantissa;
+        if (norm <= EXP_OFFSET) {
+            norm = 0;
+        }
+        else {
+            norm -= EXP_OFFSET;
+            if (norm > 255) {
+                norm = 255;
+            }
+        }
     }
 
     return norm;

http://git-wip-us.apache.org/repos/asf/lucy/blob/3932c541/core/Lucy/Index/Similarity.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Similarity.cfh b/core/Lucy/Index/Similarity.cfh
index 4798fb3..ad07f08 100644
--- a/core/Lucy/Index/Similarity.cfh
+++ b/core/Lucy/Index/Similarity.cfh
@@ -105,7 +105,7 @@ public class Lucy::Index::Similarity nickname Sim inherits Clownfish::Obj {
     Query_Norm(Similarity *self, float sum_of_squared_weights);
 
     /** encode_norm and decode_norm encode and decode between 32-bit IEEE
-     * floating point numbers and a 5-bit exponent, 3-bit mantissa float.  The
+     * floating point numbers and a 6-bit exponent, 2-bit mantissa float.  The
      * range covered by the single-byte encoding is 7x10^9 to 2x10^-9.  The
      * accuracy is about one significant decimal digit.
      */

http://git-wip-us.apache.org/repos/asf/lucy/blob/3932c541/perl/t/504-similarity.t
----------------------------------------------------------------------
diff --git a/perl/t/504-similarity.t b/perl/t/504-similarity.t
index a4d8666..f2f5afe 100644
--- a/perl/t/504-similarity.t
+++ b/perl/t/504-similarity.t
@@ -38,7 +38,7 @@ sub new {
 }
 
 package main;
-use Test::More tests => 9;
+use Test::More tests => 10;
 use Lucy::Test;
 use bytes;
 no bytes;
@@ -79,6 +79,11 @@ for ( 0 .. 255 ) {
 is_deeply( \@transformed, \@floats,
     "using the norm_decoder produces desired results" );
 
+my $small_encoded = $sim->encode_norm(1e-30);
+my $large_encoded = $sim->encode_norm(1e30);
+ok( $small_encoded != $large_encoded,
+    "extremely small and large values are encoded differently" );
+
 my $folder  = Lucy::Store::RAMFolder->new;
 my $indexer = Lucy::Index::Indexer->new(
     index  => $folder,


[2/2] git commit: refs/heads/master - Optimize decoding of similarity values

Posted by nw...@apache.org.
Optimize decoding of similarity values

Decoding of similarity values can be optimized in a similar way.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/d4a1d737
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/d4a1d737
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/d4a1d737

Branch: refs/heads/master
Commit: d4a1d737cce3cc356465c8a7b0f786843b8324b7
Parents: 3932c54
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Sun Oct 19 18:03:50 2014 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Sun Oct 19 18:19:39 2014 +0200

----------------------------------------------------------------------
 core/Lucy/Index/Similarity.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/d4a1d737/core/Lucy/Index/Similarity.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Similarity.c b/core/Lucy/Index/Similarity.c
index cceffa8..8696a79 100644
--- a/core/Lucy/Index/Similarity.c
+++ b/core/Lucy/Index/Similarity.c
@@ -193,9 +193,7 @@ Sim_Decode_Norm_IMP(Similarity *self, uint32_t input) {
         result = 0;
     }
     else {
-        const uint32_t mantissa = byte & 7;
-        const uint32_t exponent = (byte >> 3) & 31;
-        result = ((exponent + (63 - 15)) << 24) | (mantissa << 21);
+        result = (input + EXP_OFFSET) << 21;
     }
 
     return *(float*)&result;