You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2014/12/19 12:25:30 UTC

svn commit: r1646675 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/search/ lucene/core/src/java/org/apache/lucene/util/ lucene/core/src/test/org/apache/lucene/util/

Author: jpountz
Date: Fri Dec 19 11:25:30 2014
New Revision: 1646675

URL: http://svn.apache.org/r1646675
Log:
LUCENE-6118: Improve the efficiency of the history structure for recently-used filters.

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java?rev=1646675&r1=1646674&r2=1646675&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java Fri Dec 19 11:25:30 2014
@@ -36,6 +36,9 @@ import org.apache.lucene.util.FrequencyT
  */
 public final class UsageTrackingFilterCachingPolicy implements FilterCachingPolicy {
 
+  // the hash code that we use as a sentinel in the ring buffer.
+  private static final int SENTINEL = Integer.MIN_VALUE;
+
   static boolean isCostly(Filter filter) {
     // This does not measure the cost of iterating over the filter (for this we
     // already have the DocIdSetIterator#cost API) but the cost to build the
@@ -51,7 +54,7 @@ public final class UsageTrackingFilterCa
   }
 
   private final FilterCachingPolicy.CacheOnLargeSegments segmentPolicy;
-  private final FrequencyTrackingRingBuffer<Integer> recentlyUsedFilters;
+  private final FrequencyTrackingRingBuffer recentlyUsedFilters;
   private final int minFrequencyCostlyFilters;
   private final int minFrequencyCheapFilters;
   private final int minFrequencyOtherFilters;
@@ -96,7 +99,7 @@ public final class UsageTrackingFilterCa
     if (minFrequencyCheapFilters > historySize || minFrequencyCostlyFilters > historySize || minFrequencyOtherFilters > historySize) {
       throw new IllegalArgumentException("The minimum frequencies should be less than the size of the history of filters that are being tracked");
     }
-    this.recentlyUsedFilters = new FrequencyTrackingRingBuffer<>(historySize);
+    this.recentlyUsedFilters = new FrequencyTrackingRingBuffer(historySize, SENTINEL);
     this.minFrequencyCostlyFilters = minFrequencyCostlyFilters;
     this.minFrequencyCheapFilters = minFrequencyCheapFilters;
     this.minFrequencyOtherFilters = minFrequencyOtherFilters;
@@ -104,9 +107,10 @@ public final class UsageTrackingFilterCa
 
   @Override
   public void onUse(Filter filter) {
-    // Using the filter hash codes might help keep memory usage a bit lower
-    // since some filters might have non-negligible memory usage?
-    recentlyUsedFilters.add(filter.hashCode());
+    // we only track hash codes, which
+    synchronized (this) {
+      recentlyUsedFilters.add(filter.hashCode());
+    }
   }
 
   @Override
@@ -114,7 +118,10 @@ public final class UsageTrackingFilterCa
     if (segmentPolicy.shouldCache(filter, context, set) == false) {
       return false;
     }
-    final int frequency = recentlyUsedFilters.frequency(filter.hashCode());
+    final int frequency;
+    synchronized (this) {
+      frequency = recentlyUsedFilters.frequency(filter.hashCode());
+    }
     if (frequency >= minFrequencyOtherFilters) {
       return true;
     } else if (isCostly(filter) && frequency >= minFrequencyCostlyFilters) {

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java?rev=1646675&r1=1646674&r2=1646675&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java Fri Dec 19 11:25:30 2014
@@ -17,75 +17,213 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
-import java.util.ArrayDeque;
+import java.util.Arrays;
 import java.util.Collections;
-import java.util.Deque;
+import java.util.HashMap;
 import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
 
 /**
- * A ring buffer that tracks the frequency of the items that it contains.
- * This is typically useful to track popular recently-used items.
+ * A ring buffer that tracks the frequency of the integers that it contains.
+ * This is typically useful to track the hash codes of popular recently-used
+ * items.
  *
- * This class is thread-safe.
+ * This data-structure requires 22 bytes per entry on average (between 16 and
+ * 28).
  *
  * @lucene.internal
  */
-public final class FrequencyTrackingRingBuffer<T> {
+public final class FrequencyTrackingRingBuffer implements Accountable {
 
-  private final int maxSize;
-  private final Deque<T> ringBuffer;
-  private final ConcurrentMap<T, Integer> frequencies;
+  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FrequencyTrackingRingBuffer.class);
 
-  /** Create a new ring buffer that will contain at most <code>size</code> items. */
-  public FrequencyTrackingRingBuffer(int maxSize) {
+  private final int maxSize;
+  private final int[] buffer;
+  private int position;
+  private final IntBag frequencies;
+
+  /** Create a new ring buffer that will contain at most <code>maxSize</code> items.
+   *  This buffer will initially contain <code>maxSize</code> times the
+   *  <code>sentinel</code> value. */
+  public FrequencyTrackingRingBuffer(int maxSize, int sentinel) {
+    if (maxSize < 2) {
+      throw new IllegalArgumentException("maxSize must be at least 2");
+    }
     this.maxSize = maxSize;
-    this.ringBuffer = new ArrayDeque<>(maxSize);
-    this.frequencies = new ConcurrentHashMap<>();
+    buffer = new int[maxSize];
+    position = 0;
+    frequencies = new IntBag(maxSize);
+
+    Arrays.fill(buffer, sentinel);
+    for (int i = 0; i < maxSize; ++i) {
+      frequencies.add(sentinel);
+    }
+    assert frequencies.frequency(sentinel) == maxSize;
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    return BASE_RAM_BYTES_USED
+        + frequencies.ramBytesUsed()
+        + RamUsageEstimator.sizeOf(buffer);
+  }
+
+  @Override
+  public Iterable<Accountable> getChildResources() {
+    return Collections.emptyList();
   }
 
   /**
    * Add a new item to this ring buffer, potentially removing the oldest
    * entry from this buffer if it is already full.
    */
-  public synchronized void add(T item) {
-    // we need this method to be protected by a lock since it is important for
-    // correctness that the ring buffer and the frequencies table have
-    // consistent content
-    if (item == null) {
-      throw new IllegalArgumentException("null items are not supported");
-    }
-    assert ringBuffer.size() <= maxSize;
-    if (ringBuffer.size() == maxSize) {
-      // evict the oldest entry
-      final T removed = ringBuffer.removeFirst();
-      final int newFrequency = frequency(removed) - 1;
-      if (newFrequency == 0) {
-        // free for GC
-        frequencies.remove(removed);
-      } else {
-        frequencies.put(removed, newFrequency);
-      }
+  public void add(int i) {
+    // remove the previous value
+    final int removed = buffer[position];
+    final boolean removedFromBag = frequencies.remove(removed);
+    assert removedFromBag;
+    // add the new value
+    buffer[position] = i;
+    frequencies.add(i);
+    // increment the position
+    position += 1;
+    if (position == maxSize) {
+      position = 0;
     }
-
-    // add the new entry and update frequencies
-    ringBuffer.addLast(item);
-    frequencies.put(item, frequency(item) + 1);
   }
 
   /**
-   * Returns the frequency of the provided item in the ring buffer.
+   * Returns the frequency of the provided key in the ring buffer.
    */
-  public int frequency(T item) {
-    // The use of a concurrent hash map allows us to not use a lock for this read-only method
-    final Integer freq = frequencies.get(item);
-    return freq == null ? 0 : freq;
+  public int frequency(int key) {
+    return frequencies.frequency(key);
   }
 
   // pkg-private for testing
-  Map<T, Integer> asFrequencyMap() {
-    return Collections.unmodifiableMap(frequencies);
+  Map<Integer, Integer> asFrequencyMap() {
+    return frequencies.asMap();
+  }
+
+  /**
+   * A bag of integers.
+   * Since in the context of the ring buffer the maximum size is known up-front
+   * there is no need to worry about resizing the underlying storage.
+   */
+  private static class IntBag implements Accountable {
+
+    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(IntBag.class);
+
+    private final int[] keys;
+    private final int[] freqs;
+    private final int mask;
+
+    IntBag(int maxSize) {
+      // load factor of 2/3
+      int capacity = Math.max(2, maxSize * 3 / 2);
+      // round up to the next power of two
+      capacity = Integer.highestOneBit(capacity - 1) << 1;
+      assert capacity > maxSize;
+      keys = new int[capacity];
+      freqs = new int[capacity];
+      mask = capacity - 1;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return BASE_RAM_BYTES_USED
+          + RamUsageEstimator.sizeOf(keys)
+          + RamUsageEstimator.sizeOf(freqs);
+    }
+
+    @Override
+    public Iterable<Accountable> getChildResources() {
+      return Collections.emptyList();
+    }
+
+    /** Return the frequency of the give key in the bag. */
+    int frequency(int key) {
+      for (int slot = key & mask; ; slot = (slot + 1) & mask) {
+        if (keys[slot] == key) {
+          return freqs[slot];
+        } else if (freqs[slot] == 0) {
+          return 0;
+        }
+      }
+    }
+
+    /** Increment the frequency of the given key by 1 and return its new frequency. */
+    int add(int key) {
+      for (int slot = key & mask; ; slot = (slot + 1) & mask) {
+        if (freqs[slot] == 0) {
+          keys[slot] = key;
+          return freqs[slot] = 1;
+        } else if (keys[slot] == key) {
+          return ++freqs[slot];
+        }
+      }
+    }
+
+    /** Decrement the frequency of the given key by one, or do nothing if the
+     *  key is not present in the bag. Returns true iff the key was contained
+     *  in the bag. */
+    boolean remove(int key) {
+      for (int slot = key & mask; ; slot = (slot + 1) & mask) {
+        if (freqs[slot] == 0) {
+          // no such key in the bag
+          return false;
+        } else if (keys[slot] == key) {
+          final int newFreq = --freqs[slot];
+          if (newFreq == 0) { // removed
+            relocateAdjacentKeys(slot);
+          }
+          return true;
+        }
+      }
+    }
+
+    private void relocateAdjacentKeys(int freeSlot) {
+      for (int slot = (freeSlot + 1) & mask; ; slot = (slot + 1) & mask) {
+        final int freq = freqs[slot];
+        if (freq == 0) {
+          // end of the collision chain, we're done
+          break;
+        }
+        final int key = keys[slot];
+        // the slot where <code>key</code> should be if there were no collisions
+        final int expectedSlot = key & mask;
+        // if the free slot is between the expected slot and the slot where the
+        // key is, then we can relocate there
+        if (between(expectedSlot, slot, freeSlot)) {
+          keys[freeSlot] = key;
+          freqs[freeSlot] = freq;
+          // slot is the new free slot
+          freqs[slot] = 0;
+          freeSlot = slot;
+        }
+      }
+    }
+
+    /** Given a chain of occupied slots between <code>chainStart</code>
+     *  and <code>chainEnd</code>, return whether <code>slot</code> is
+     *  between the start and end of the chain. */
+    private static boolean between(int chainStart, int chainEnd, int slot) {
+      if (chainStart <= chainEnd) {
+        return chainStart <= slot && slot <= chainEnd;
+      } else {
+        // the chain is across the end of the array
+        return slot >= chainStart || slot <= chainEnd;
+      }
+    }
+
+    Map<Integer, Integer> asMap() {
+      Map<Integer, Integer> map = new HashMap<>();
+      for (int i = 0; i < keys.length; ++i) {
+        if (freqs[i] > 0) {
+          map.put(keys[i], freqs[i]);
+        }
+      }
+      return map;
+    }
+
   }
 
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java?rev=1646675&r1=1646674&r2=1646675&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java Fri Dec 19 11:25:30 2014
@@ -24,15 +24,19 @@ import java.util.Map;
 
 public class TestFrequencyTrackingRingBuffer extends LuceneTestCase {
 
-  private static <T> void assertBuffer(FrequencyTrackingRingBuffer<T> buffer, int maxSize, List<T> items) {
-    final List<T> recentItems;
+  private static void assertBuffer(FrequencyTrackingRingBuffer buffer, int maxSize, int sentinel, List<Integer> items) {
+    final List<Integer> recentItems;
     if (items.size() <= maxSize) {
-      recentItems = items;
+      recentItems = new ArrayList<>();
+      for (int i = items.size(); i < maxSize; ++i) {
+        recentItems.add(sentinel);
+      }
+      recentItems.addAll(items);
     } else {
       recentItems = items.subList(items.size() - maxSize, items.size());
     }
-    final Map<T, Integer> expectedFrequencies = new HashMap<T, Integer>();
-    for (T item : recentItems) {
+    final Map<Integer, Integer> expectedFrequencies = new HashMap<Integer, Integer>();
+    for (Integer item : recentItems) {
       final Integer freq = expectedFrequencies.get(item);
       if (freq == null) {
         expectedFrequencies.put(item, 1);
@@ -46,18 +50,29 @@ public class TestFrequencyTrackingRingBu
   public void test() {
     final int iterations = atLeast(100);
     for (int i = 0; i < iterations; ++i) {
-      final int maxSize = 1 + random().nextInt(100);
-      final int numitems = random().nextInt(500);
+      final int maxSize = 2 + random().nextInt(100);
+      final int numitems = random().nextInt(5000);
       final int maxitem = 1 + random().nextInt(100);
       List<Integer> items = new ArrayList<>();
-      FrequencyTrackingRingBuffer<Integer> buffer = new FrequencyTrackingRingBuffer<>(maxSize);
+      final int sentinel = random().nextInt(200);
+      FrequencyTrackingRingBuffer buffer = new FrequencyTrackingRingBuffer(maxSize, sentinel);
       for (int j = 0; j < numitems; ++j) {
         final Integer item = random().nextInt(maxitem);
         items.add(item);
         buffer.add(item);
       }
-      assertBuffer(buffer, maxSize, items);
+      assertBuffer(buffer, maxSize, sentinel, items);
+    }
+  }
+
+  public void testRamBytesUsed() {
+    final int maxSize = 2 + random().nextInt(10000);
+    final int sentinel = random().nextInt();
+    FrequencyTrackingRingBuffer buffer = new FrequencyTrackingRingBuffer(maxSize, sentinel);
+    for (int i = 0; i < 10000; ++i) {
+      buffer.add(random().nextInt());
     }
+    assertEquals(RamUsageTester.sizeOf(buffer), buffer.ramBytesUsed());
   }
 
 }