You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2014/03/27 23:49:11 UTC

git commit: [SPARK-1268] Adding XOR and AND-NOT operations to spark.util.collection.BitSet

Repository: spark
Updated Branches:
  refs/heads/master 53953d093 -> 6f986f0b8


[SPARK-1268] Adding XOR and AND-NOT operations to spark.util.collection.BitSet

Symmetric difference (xor) in particular is useful for computing some distance metrics (e.g. Hamming). Unit tests added.

Author: Petko Nikolov <ni...@soundcloud.com>

Closes #172 from petko-nikolov/bitset-imprv and squashes the following commits:

451f28b [Petko Nikolov] fixed style mistakes
5beba18 [Petko Nikolov] rm outer loop in andNot test
0e61035 [Petko Nikolov] conform to spark style; rm redundant asserts; more unit tests added; use arraycopy instead of loop
d53cdb9 [Petko Nikolov] rm incidentally added space
4e1df43 [Petko Nikolov] adding xor and and-not to BitSet; unit tests added


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f986f0b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f986f0b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f986f0b

Branch: refs/heads/master
Commit: 6f986f0b87bd03f4df2bf6c917e61241e9b14ac2
Parents: 53953d0
Author: Petko Nikolov <ni...@soundcloud.com>
Authored: Thu Mar 27 15:49:07 2014 -0700
Committer: Reynold Xin <rx...@apache.org>
Committed: Thu Mar 27 15:49:07 2014 -0700

----------------------------------------------------------------------
 .../apache/spark/util/collection/BitSet.scala   | 39 +++++++++
 .../spark/util/collection/BitSetSuite.scala     | 83 ++++++++++++++++++++
 2 files changed, 122 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/6f986f0b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
index d3153d2..af1f646 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
@@ -89,6 +89,45 @@ class BitSet(numBits: Int) extends Serializable {
   }
 
   /**
+   * Compute the symmetric difference by performing bit-wise XOR of the two sets returning the
+   * result.
+   */
+  def ^(other: BitSet): BitSet = {
+    val newBS = new BitSet(math.max(capacity, other.capacity))
+    val smaller = math.min(numWords, other.numWords)
+    var ind = 0
+    while (ind < smaller) {
+      newBS.words(ind) = words(ind) ^ other.words(ind)
+      ind += 1
+    }
+    if (ind < numWords) {
+      Array.copy( words, ind, newBS.words, ind, numWords - ind )
+    }
+    if (ind < other.numWords) {
+      Array.copy( other.words, ind, newBS.words, ind, other.numWords - ind )
+    }
+    newBS
+  }
+
+  /**
+   * Compute the difference of the two sets by performing bit-wise AND-NOT returning the
+   * result.
+   */
+  def andNot(other: BitSet): BitSet = {
+    val newBS = new BitSet(capacity)
+    val smaller = math.min(numWords, other.numWords)
+    var ind = 0
+    while (ind < smaller) {
+      newBS.words(ind) = words(ind) & ~other.words(ind)
+      ind += 1
+    }
+    if (ind < numWords) {
+      Array.copy( words, ind, newBS.words, ind, numWords - ind )
+    }
+    newBS
+  }
+
+  /**
    * Sets the bit at the specified index to true.
    * @param index the bit index
    */

http://git-wip-us.apache.org/repos/asf/spark/blob/6f986f0b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
index c32183c..b85a409 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/BitSetSuite.scala
@@ -69,4 +69,87 @@ class BitSetSuite extends FunSuite {
     assert(bitset.nextSetBit(96) === 96)
     assert(bitset.nextSetBit(97) === -1)
   }
+
+  test( "xor len(bitsetX) < len(bitsetY)" ) {
+    val setBitsX = Seq( 0, 2, 3, 37, 41 )
+    val setBitsY = Seq( 0, 1, 3, 37, 38, 41, 85)
+    val bitsetX = new BitSet(60)
+    setBitsX.foreach( i => bitsetX.set(i))
+    val bitsetY = new BitSet(100)
+    setBitsY.foreach( i => bitsetY.set(i))
+
+    val bitsetXor = bitsetX ^ bitsetY
+
+    assert(bitsetXor.nextSetBit(0) === 1)
+    assert(bitsetXor.nextSetBit(1) === 1)
+    assert(bitsetXor.nextSetBit(2) === 2)
+    assert(bitsetXor.nextSetBit(3) === 38)
+    assert(bitsetXor.nextSetBit(38) === 38)
+    assert(bitsetXor.nextSetBit(39) === 85)
+    assert(bitsetXor.nextSetBit(42) === 85)
+    assert(bitsetXor.nextSetBit(85) === 85)
+    assert(bitsetXor.nextSetBit(86) === -1)
+
+  }
+
+  test( "xor len(bitsetX) > len(bitsetY)" ) {
+    val setBitsX = Seq( 0, 1, 3, 37, 38, 41, 85)
+    val setBitsY   = Seq( 0, 2, 3, 37, 41 )
+    val bitsetX = new BitSet(100)
+    setBitsX.foreach( i => bitsetX.set(i))
+    val bitsetY = new BitSet(60)
+    setBitsY.foreach( i => bitsetY.set(i))
+
+    val bitsetXor = bitsetX ^ bitsetY
+
+    assert(bitsetXor.nextSetBit(0) === 1)
+    assert(bitsetXor.nextSetBit(1) === 1)
+    assert(bitsetXor.nextSetBit(2) === 2)
+    assert(bitsetXor.nextSetBit(3) === 38)
+    assert(bitsetXor.nextSetBit(38) === 38)
+    assert(bitsetXor.nextSetBit(39) === 85)
+    assert(bitsetXor.nextSetBit(42) === 85)
+    assert(bitsetXor.nextSetBit(85) === 85)
+    assert(bitsetXor.nextSetBit(86) === -1)
+
+  }
+
+  test( "andNot len(bitsetX) < len(bitsetY)" ) {
+    val setBitsX = Seq( 0, 2, 3, 37, 41, 48 )
+    val setBitsY = Seq( 0, 1, 3, 37, 38, 41, 85)
+    val bitsetX = new BitSet(60)
+    setBitsX.foreach( i => bitsetX.set(i))
+    val bitsetY = new BitSet(100)
+    setBitsY.foreach( i => bitsetY.set(i))
+
+    val bitsetDiff = bitsetX.andNot( bitsetY )
+
+    assert(bitsetDiff.nextSetBit(0) === 2)
+    assert(bitsetDiff.nextSetBit(1) === 2)
+    assert(bitsetDiff.nextSetBit(2) === 2)
+    assert(bitsetDiff.nextSetBit(3) === 48)
+    assert(bitsetDiff.nextSetBit(48) === 48)
+    assert(bitsetDiff.nextSetBit(49) === -1)
+    assert(bitsetDiff.nextSetBit(65) === -1)
+  }
+
+  test( "andNot len(bitsetX) > len(bitsetY)" ) {
+    val setBitsX = Seq( 0, 1, 3, 37, 38, 41, 85)
+    val setBitsY = Seq( 0, 2, 3, 37, 41, 48 )
+    val bitsetX = new BitSet(100)
+    setBitsX.foreach( i => bitsetX.set(i))
+    val bitsetY = new BitSet(60)
+    setBitsY.foreach( i => bitsetY.set(i))
+
+    val bitsetDiff = bitsetX.andNot( bitsetY )
+
+    assert(bitsetDiff.nextSetBit(0) === 1)
+    assert(bitsetDiff.nextSetBit(1) === 1)
+    assert(bitsetDiff.nextSetBit(2) === 38)
+    assert(bitsetDiff.nextSetBit(3) === 38)
+    assert(bitsetDiff.nextSetBit(38) === 38)
+    assert(bitsetDiff.nextSetBit(39) === 85)
+    assert(bitsetDiff.nextSetBit(85) === 85)
+    assert(bitsetDiff.nextSetBit(86) === -1)
+  }
 }