You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by jg...@apache.org on 2011/10/05 23:14:51 UTC
svn commit: r1179442 - in /hbase/trunk: ./ src/main/java/org/apache/hadoop/hbase/ src/main/java/org/apache/hadoop/hbase/regionserver/ src/main/java/org/apache/hadoop/hbase/util/ src/test/java/org/apache/hadoop/hbase/regionserver/

Author: jgray
Date: Wed Oct  5 21:14:50 2011
New Revision: 1179442

URL: http://svn.apache.org/viewvc?rev=1179442&view=rev
Log:
HBASE-4465  Lazy-seek optimization for StoreFile scanners (mikhail/liyin)

Added:
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java
Removed:
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/AbstractKeyValueScanner.java
Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/KeyValue.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ColumnCount.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/MemStore.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ScanQueryMatcher.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/CollectionBackedScanner.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestBlocksRead.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestMemStore.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Wed Oct  5 21:14:50 2011
@@ -11,6 +11,7 @@ Release 0.93.0 - Unreleased
    HBASE-4477  Ability for an application to store metadata into the        
                transaction log (dhruba via jgray)
    HBASE-4145  Provide metrics for hbase client (Ming Ma)
+   HBASE-4465  Lazy-seek optimization for StoreFile scanners (mikhail/liyin)
 
   BUG FIXES
    HBASE-4488  Store could miss rows during flush (Lars H via jgray)

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/KeyValue.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/KeyValue.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/KeyValue.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/KeyValue.java Wed Oct  5 21:14:50 2011
@@ -1757,6 +1757,21 @@ public class KeyValue implements Writabl
   }
 
   /**
+   * Creates the first KV with the row/family/qualifier of this KV and the
+   * given timestamp. Uses the "maximum" KV type that guarantees that the new
+   * KV is the lowest possible for this combination of row, family, qualifier,
+   * and timestamp. This KV's own timestamp is ignored. While this function
+   * copies the value from this KV, it is normally used on key-only KVs.
+   */
+  public KeyValue createFirstOnRowColTS(long ts) {
+    return new KeyValue(
+        bytes, getRowOffset(), getRowLength(),
+        bytes, getFamilyOffset(), getFamilyLength(),
+        bytes, getQualifierOffset(), getQualifierLength(),
+        ts, Type.Maximum, bytes, getValueOffset(), getValueLength());
+  }
+
+  /**
    * @param b
    * @return A KeyValue made of a byte array that holds the key-only part.
    * Needed to convert hfile index members to KeyValues.

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ColumnCount.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ColumnCount.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ColumnCount.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ColumnCount.java Wed Oct  5 21:14:50 2011
@@ -106,16 +106,4 @@ public class ColumnCount {
     this.count = count;
   }
 
-
-  /**
-   * Check to see if needed to fetch more versions
-   * @param max
-   * @return true if more versions are needed, false otherwise
-   */
-  public boolean needMore(int max) {
-    if(this.count < max) {
-      return true;
-    }
-    return false;
-  }
 }

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueHeap.java Wed Oct  5 21:14:50 2011
@@ -40,34 +40,23 @@ import java.util.PriorityQueue;
  * also implements InternalScanner.  WARNING: As is, if you try to use this
  * as an InternalScanner at the Store level, you will get runtime exceptions.
  */
-public class KeyValueHeap implements KeyValueScanner, InternalScanner {
+public class KeyValueHeap extends NonLazyKeyValueScanner
+    implements KeyValueScanner, InternalScanner {
   private PriorityQueue<KeyValueScanner> heap = null;
-  private KeyValueScanner current = null;
-  private KVScannerComparator comparator;
 
   /**
-   * A helper enum that knows how to call the correct seek function within a
-   * {@link KeyValueScanner}.
+   * The current sub-scanner, i.e. the one that contains the next key/value
+   * to return to the client. This scanner is NOT included in {@link #heap}
+   * (but we frequently add it back to the heap and pull the new winner out).
+   * We maintain an invariant that the current sub-scanner has already done
+   * a real seek, and that current.peek() is always a real key/value (or null)
+   * except for the fake last-key-on-row-column supplied by the multi-column
+   * Bloom filter optimization, which is OK to propagate to StoreScanner. In
+   * order to ensure that, always use {@link #pollRealKV()} to update current.
    */
-  public enum SeekType {
-    NORMAL {
-      @Override
-      public boolean seek(KeyValueScanner scanner, KeyValue kv,
-          boolean forward) throws IOException {
-        return forward ? scanner.reseek(kv) : scanner.seek(kv);
-      }
-    },
-    EXACT {
-      @Override
-      public boolean seek(KeyValueScanner scanner, KeyValue kv,
-          boolean forward) throws IOException {
-        return scanner.seekExactly(kv, forward);
-      }
-    };
+  private KeyValueScanner current = null;
 
-    public abstract boolean seek(KeyValueScanner scanner, KeyValue kv,
-        boolean forward) throws IOException;
-  }
+  private KVScannerComparator comparator;
 
   /**
    * Constructor.  This KeyValueHeap will handle closing of passed in
@@ -76,7 +65,7 @@ public class KeyValueHeap implements Key
    * @param comparator
    */
   public KeyValueHeap(List<? extends KeyValueScanner> scanners,
-      KVComparator comparator) {
+      KVComparator comparator) throws IOException {
     this.comparator = new KVScannerComparator(comparator);
     if (!scanners.isEmpty()) {
       this.heap = new PriorityQueue<KeyValueScanner>(scanners.size(),
@@ -88,7 +77,7 @@ public class KeyValueHeap implements Key
           scanner.close();
         }
       }
-      this.current = heap.poll();
+      this.current = pollRealKV();
     }
   }
 
@@ -107,13 +96,13 @@ public class KeyValueHeap implements Key
     KeyValue kvNext = this.current.peek();
     if (kvNext == null) {
       this.current.close();
-      this.current = this.heap.poll();
+      this.current = pollRealKV();
     } else {
       KeyValueScanner topScanner = this.heap.peek();
       if (topScanner == null ||
           this.comparator.compare(kvNext, topScanner.peek()) >= 0) {
         this.heap.add(this.current);
-        this.current = this.heap.poll();
+        this.current = pollRealKV();
       }
     }
     return kvReturn;
@@ -149,7 +138,7 @@ public class KeyValueHeap implements Key
     } else {
       this.heap.add(this.current);
     }
-    this.current = this.heap.poll();
+    this.current = pollRealKV();
     return (this.current != null);
   }
 
@@ -230,13 +219,20 @@ public class KeyValueHeap implements Key
    * <p>
    * As individual scanners may run past their ends, those scanners are
    * automatically closed and removed from the heap.
+   * <p>
+   * This function (and {@link #reseek(KeyValue)}) does not do multi-column
+   * Bloom filter and lazy-seek optimizations. To enable those, call
+   * {@link #requestSeek(KeyValue, boolean, boolean)}.
    * @param seekKey KeyValue to seek at or after
    * @return true if KeyValues exist at or after specified key, false if not
    * @throws IOException
    */
   @Override
   public boolean seek(KeyValue seekKey) throws IOException {
-    return generalizedSeek(seekKey, SeekType.NORMAL, false);
+    return generalizedSeek(false,    // This is not a lazy seek
+                           seekKey,
+                           false,    // forward (false: this is not a reseek)
+                           false);   // Not using Bloom filters
   }
 
   /**
@@ -245,20 +241,36 @@ public class KeyValueHeap implements Key
    */
   @Override
   public boolean reseek(KeyValue seekKey) throws IOException {
-    return generalizedSeek(seekKey, SeekType.NORMAL, true);
+    return generalizedSeek(false,    // This is not a lazy seek
+                           seekKey,
+                           true,     // forward (true because this is reseek)
+                           false);   // Not using Bloom filters
   }
 
   /**
    * {@inheritDoc}
    */
   @Override
-  public boolean seekExactly(KeyValue seekKey, boolean forward)
-      throws IOException {
-    return generalizedSeek(seekKey, SeekType.EXACT, forward);
+  public boolean requestSeek(KeyValue key, boolean forward,
+      boolean useBloom) throws IOException {
+    return generalizedSeek(true, key, forward, useBloom);
   }
 
-  private boolean generalizedSeek(KeyValue seekKey, SeekType seekType,
-      boolean forward) throws IOException {
+  /**
+   * @param isLazy whether we are trying to seek to exactly the given row/col.
+   *          Enables Bloom filter and most-recent-file-first optimizations for
+   *          multi-column get/scan queries.
+   * @param seekKey key to seek to
+   * @param forward whether to seek forward (also known as reseek)
+   * @param useBloom whether to optimize seeks using Bloom filters
+   */
+  private boolean generalizedSeek(boolean isLazy, KeyValue seekKey,
+      boolean forward, boolean useBloom) throws IOException {
+    if (!isLazy && useBloom) {
+      throw new IllegalArgumentException("Multi-column Bloom filter " +
+          "optimization requires a lazy seek");
+    }
+
     if (current == null) {
       return false;
     }
@@ -269,12 +281,26 @@ public class KeyValueHeap implements Key
     while ((scanner = heap.poll()) != null) {
       KeyValue topKey = scanner.peek();
       if (comparator.getComparator().compare(seekKey, topKey) <= 0) {
-        // Top KeyValue is at-or-after Seek KeyValue
-        current = scanner;
-        return true;
+        // Top KeyValue is at-or-after Seek KeyValue. We only know that all
+        // scanners are at or after seekKey (because fake keys of
+        // scanners where a lazy-seek operation has been done are not greater
+        // than their real next keys) but we still need to enforce our
+        // invariant that the top scanner has done a real seek. This way
+        // StoreScanner and RegionScanner do not have to worry about fake keys.
+        heap.add(scanner);
+        current = pollRealKV();
+        return current != null;
+      }
+
+      boolean seekResult;
+      if (isLazy) {
+        seekResult = scanner.requestSeek(seekKey, forward, useBloom);
+      } else {
+        seekResult = NonLazyKeyValueScanner.doRealSeek(
+            scanner, seekKey, forward);
       }
-      
-      if (!seekType.seek(scanner, seekKey, forward)) {
+
+      if (!seekResult) {
         scanner.close();
       } else {
         heap.add(scanner);
@@ -286,6 +312,62 @@ public class KeyValueHeap implements Key
   }
 
   /**
+   * Fetches the top sub-scanner from the priority queue, ensuring that a real
+   * seek has been done on it. Works by fetching the top sub-scanner, and if it
+   * has not done a real seek, making it do so (which will modify its top KV),
+   * putting it back, and repeating this until success. Relies on the fact that
+   * on a lazy seek we set the current key of a StoreFileScanner to a KV that
+   * is not greater than the real next KV to be read from that file, so the
+   * scanner that bubbles up to the top of the heap will have global next KV in
+   * this scanner heap if (1) it has done a real seek and (2) its KV is the top
+   * among all top KVs (some of which are fake) in the scanner heap.
+   */
+  private KeyValueScanner pollRealKV() throws IOException {
+    KeyValueScanner kvScanner = heap.poll();
+    if (kvScanner == null) {
+      return null;
+    }
+
+    while (kvScanner != null && !kvScanner.realSeekDone()) {
+      if (kvScanner.peek() != null) {
+        kvScanner.enforceSeek();
+        KeyValue curKV = kvScanner.peek();
+        if (curKV != null) {
+          KeyValueScanner nextEarliestScanner = heap.peek();
+          if (nextEarliestScanner == null) {
+            // The heap is empty. Return the only possible scanner.
+            return kvScanner;
+          }
+
+          // Compare the current scanner to the next scanner. We try to avoid
+          // putting the current one back into the heap if possible.
+          KeyValue nextKV = nextEarliestScanner.peek();
+          if (nextKV == null || comparator.compare(curKV, nextKV) <= 0) {
+            // We already have the scanner with the earliest KV, so return it.
+            return kvScanner;
+          }
+
+          // Otherwise, put the scanner back into the heap and let it compete
+          // against all other scanners (both those that have done a "real
+          // seek" and a "lazy seek").
+          heap.add(kvScanner);
+        } else {
+          // Close the scanner because we did a real seek and found out there
+          // are no more KVs.
+          kvScanner.close();
+        }
+      } else {
+        // Close the scanner because it has already run out of KVs even before
+        // we had to do a real seek on it.
+        kvScanner.close();
+      }
+      kvScanner = heap.poll();
+    }
+
+    return kvScanner;
+  }
+
+  /**
    * @return the current Heap
    */
   public PriorityQueue<KeyValueScanner> getHeap() {

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/KeyValueScanner.java Wed Oct  5 21:14:50 2011
@@ -57,16 +57,6 @@ public interface KeyValueScanner {
   public boolean reseek(KeyValue key) throws IOException;
 
   /**
-   * Similar to {@link #seek} (or {@link #reseek} if forward is true) but only
-   * does a seek operation after checking that it is really necessary for the
-   * row/column combination specified by the kv parameter. This function was
-   * added to avoid unnecessary disk seeks on multi-column get queries using
-   * Bloom filter checking. Should only be used for queries where the set of
-   * columns is specified exactly.
-   */
-  public boolean seekExactly(KeyValue kv, boolean forward) throws IOException;
-
-  /**
    * Get the sequence id associated with this KeyValueScanner. This is required
    * for comparing multiple files to find out which one has the latest data.
    * The default implementation for this would be to return 0. A file having
@@ -78,4 +68,37 @@ public interface KeyValueScanner {
    * Close the KeyValue scanner.
    */
   public void close();
+
+  // "Lazy scanner" optimizations
+
+  /**
+   * Similar to {@link #seek} (or {@link #reseek} if forward is true) but only
+   * does a seek operation after checking that it is really necessary for the
+   * row/column combination specified by the kv parameter. This function was
+   * added to avoid unnecessary disk seeks by checking row-column Bloom filters
+   * before a seek on multi-column get/scan queries, and to optimize by looking
+   * up more recent files first.
+   * @param forward do a forward-only "reseek" instead of a random-access seek
+   * @param useBloom whether to enable multi-column Bloom filter optimization
+   */
+  public boolean requestSeek(KeyValue kv, boolean forward, boolean useBloom)
+      throws IOException;
+
+  /**
+   * We optimize our store scanners by checking the most recent store file
+   * first, so we sometimes pretend we have done a seek but delay it until the
+   * store scanner bubbles up to the top of the key-value heap. This method is
+   * then used to ensure the top store file scanner has done a seek operation.
+   */
+  public boolean realSeekDone();
+
+  /**
+   * Does the real seek operation in case it was skipped by
+   * {@link #seekToRowCol(KeyValue, boolean)}. Note that this function should
+   * be never called on scanners that always do real seek operations (i.e. most
+   * of the scanners). The easiest way to achieve this is to call
+   * {@link #realSeekDone()} first.
+   */
+  public void enforceSeek() throws IOException;
+
 }

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/MemStore.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/MemStore.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/MemStore.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/MemStore.java Wed Oct  5 21:14:50 2011
@@ -632,7 +632,7 @@ public class MemStore implements HeapSiz
    * map and snapshot.
    * This behaves as if it were a real scanner but does not maintain position.
    */
-  protected class MemStoreScanner extends AbstractKeyValueScanner {
+  protected class MemStoreScanner extends NonLazyKeyValueScanner {
     // Next row information for either kvset or snapshot
     private KeyValue kvsetNextRow = null;
     private KeyValue snapshotNextRow = null;

Added: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java?rev=1179442&view=auto
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java (added)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/NonLazyKeyValueScanner.java Wed Oct  5 21:14:50 2011
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.IOException;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.hadoop.hbase.KeyValue;
+
+/**
+ * A "non-lazy" scanner which always does a real seek operation. Most scanners
+ * are inherited from this class.
+ */
+public abstract class NonLazyKeyValueScanner implements KeyValueScanner {
+
+  @Override
+  public boolean requestSeek(KeyValue kv, boolean forward, boolean useBloom)
+      throws IOException {
+    return doRealSeek(this, kv, forward);
+  }
+
+  @Override
+  public boolean realSeekDone() {
+    return true;
+  }
+
+  @Override
+  public void enforceSeek() throws IOException {
+    throw new NotImplementedException("enforceSeek must not be called on a " +
+        "non-lazy scanner");
+  }
+
+  public static boolean doRealSeek(KeyValueScanner scanner,
+      KeyValue kv, boolean forward) throws IOException {
+    return forward ? scanner.reseek(kv) : scanner.seek(kv);
+  }
+
+}

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ScanQueryMatcher.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ScanQueryMatcher.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ScanQueryMatcher.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/ScanQueryMatcher.java Wed Oct  5 21:14:50 2011
@@ -59,12 +59,6 @@ public class ScanQueryMatcher {
 
   /** Row the query is on */
   protected byte [] row;
-  
-  /** 
-   * True if we are only interested in the given exact set of columns. In that
-   * case we can use Bloom filters to avoid unnecessary disk seeks.
-   */
-  private boolean exactColumnQuery;
 
   /**
    * Constructs a ScanQueryMatcher for a Scan.
@@ -95,7 +89,6 @@ public class ScanQueryMatcher {
       // between rows, not between storefiles.
       this.columns = new ExplicitColumnTracker(columns, minVersions, maxVersions,
           ttl);
-      exactColumnQuery = true;
     }
   }
 
@@ -313,10 +306,6 @@ public class ScanQueryMatcher {
         null, 0, 0);
   }
 
-  public boolean isExactColumnQuery() {
-    return exactColumnQuery;
-  }
-
   /**
    * {@link #match} return codes.  These instruct the scanner moving through
    * memstores and StoreFiles what to do with the current KeyValue.

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java Wed Oct  5 21:14:50 2011
@@ -1376,6 +1376,10 @@ public class StoreFile {
     void disableBloomFilterForTesting() {
       bloomFilter = null;
     }
+
+    public long getMaxTimestamp() {
+      return timeRangeTracker.maximumTimestamp;
+    }
   }
 
   /**

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileScanner.java Wed Oct  5 21:14:50 2011
@@ -28,11 +28,11 @@ import org.apache.hadoop.hbase.io.hfile.
 import org.apache.hadoop.hbase.regionserver.StoreFile.Reader;
 
 import java.io.IOException;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.SortedSet;
-import java.util.concurrent.atomic.AtomicLong;
 
 /**
  * KeyValueScanner adaptor over the Reader.  It also provides hooks into
@@ -46,6 +46,10 @@ class StoreFileScanner implements KeyVal
   private final HFileScanner hfs;
   private KeyValue cur = null;
 
+  private boolean realSeekDone;
+  private boolean delayedReseek;
+  private KeyValue delayedSeekKV;
+
   private static final AtomicLong seekCount = new AtomicLong();
 
   /**
@@ -99,12 +103,16 @@ class StoreFileScanner implements KeyVal
   public boolean seek(KeyValue key) throws IOException {
     seekCount.incrementAndGet();
     try {
-      if(!seekAtOrAfter(hfs, key)) {
-        close();
-        return false;
+      try {
+        if(!seekAtOrAfter(hfs, key)) {
+          close();
+          return false;
+        }
+        cur = hfs.getKeyValue();
+        return true;
+      } finally {
+        realSeekDone = true;
       }
-      cur = hfs.getKeyValue();
-      return true;
     } catch(IOException ioe) {
       throw new IOException("Could not seek " + this, ioe);
     }
@@ -113,12 +121,16 @@ class StoreFileScanner implements KeyVal
   public boolean reseek(KeyValue key) throws IOException {
     seekCount.incrementAndGet();
     try {
-      if (!reseekAtOrAfter(hfs, key)) {
-        close();
-        return false;
+      try {
+        if (!reseekAtOrAfter(hfs, key)) {
+          close();
+          return false;
+        }
+        cur = hfs.getKeyValue();
+        return true;
+      } finally {
+        realSeekDone = true;
       }
-      cur = hfs.getKeyValue();
-      return true;
     } catch (IOException ioe) {
       throw new IOException("Could not seek " + this, ioe);
     }
@@ -174,27 +186,72 @@ class StoreFileScanner implements KeyVal
     return reader.getSequenceID();
   }
 
+  /**
+   * Pretend we have done a seek but don't do it yet, if possible. The hope is
+   * that we find requested columns in more recent files and won't have to seek
+   * in older files. Creates a fake key/value with the given row/column and the
+   * highest (most recent) possible timestamp we might get from this file. When
+   * users of such "lazy scanner" need to know the next KV precisely (e.g. when
+   * this scanner is at the top of the heap), they run {@link #enforceSeek()}.
+   * <p>
+   * Note that this function does guarantee that the current KV of this scanner
+   * will be advanced to at least the given KV. Because of this, it does have
+   * to do a real seek in cases when the seek timestamp is older than the
+   * highest timestamp of the file, e.g. when we are trying to seek to the next
+   * row/column and use OLDEST_TIMESTAMP in the seek key.
+   */
   @Override
-  public boolean seekExactly(KeyValue kv, boolean forward)
+  public boolean requestSeek(KeyValue kv, boolean forward, boolean useBloom)
       throws IOException {
     if (reader.getBloomFilterType() != StoreFile.BloomType.ROWCOL ||
-        kv.getRowLength() == 0 || kv.getQualifierLength() == 0) {
-      return forward ? reseek(kv) : seek(kv);
+        kv.getFamilyLength() == 0) {
+      useBloom = false;
     }
 
-    boolean isInBloom = reader.passesBloomFilter(kv.getBuffer(),
-        kv.getRowOffset(), kv.getRowLength(), kv.getBuffer(),
-        kv.getQualifierOffset(), kv.getQualifierLength());
-    if (isInBloom) {
-      // This row/column might be in this store file. Do a normal seek.
-      return forward ? reseek(kv) : seek(kv);
+    boolean haveToSeek = true;
+    if (useBloom) {
+      haveToSeek = reader.passesBloomFilter(kv.getBuffer(),
+          kv.getRowOffset(), kv.getRowLength(), kv.getBuffer(),
+          kv.getQualifierOffset(), kv.getQualifierLength());
     }
 
+    delayedReseek = forward;
+    delayedSeekKV = kv;
+
+    if (haveToSeek) {
+      // This row/column might be in this store file (or we did not use the
+      // Bloom filter), so we still need to seek.
+      realSeekDone = false;
+      long maxTimestampInFile = reader.getMaxTimestamp();
+      long seekTimestamp = kv.getTimestamp();
+      if (seekTimestamp > maxTimestampInFile) {
+        // Create a fake key that is not greater than the real next key.
+        // (Lower timestamps correspond to higher KVs.)
+        // To understand this better, consider that we are asked to seek to
+        // a higher timestamp than the max timestamp in this file. We know that
+        // the next point when we have to consider this file again is when we
+        // pass the max timestamp of this file (with the same row/column).
+        cur = kv.createFirstOnRowColTS(maxTimestampInFile);
+      } else {
+        // This will be the case e.g. when we need to seek to the next
+        // row/column, and we don't know exactly what they are, so we set the
+        // seek key's timestamp to OLDEST_TIMESTAMP to skip the rest of this
+        // row/column.
+        enforceSeek();
+      }
+      return cur != null;
+    }
+
+    // Multi-column Bloom filter optimization.
     // Create a fake key/value, so that this scanner only bubbles up to the top
     // of the KeyValueHeap in StoreScanner after we scanned this row/column in
     // all other store files. The query matcher will then just skip this fake
-    // key/value and the store scanner will progress to the next column.
+    // key/value and the store scanner will progress to the next column. This
+    // is obviously not a "real real" seek, but unlike the fake KV earlier in
+    // this method, we want this to be propagated to ScanQueryMatcher.
     cur = kv.createLastOnRowCol();
+
+    realSeekDone = true;
     return true;
   }
 
@@ -202,6 +259,23 @@ class StoreFileScanner implements KeyVal
     return reader;
   }
 
+  @Override
+  public boolean realSeekDone() {
+    return realSeekDone;
+  }
+
+  @Override
+  public void enforceSeek() throws IOException {
+    if (realSeekDone)
+      return;
+
+    if (delayedReseek) {
+      reseek(delayedSeekKV);
+    } else {
+      seek(delayedSeekKV);
+    }
+  }
+
   // Test methods
 
   static final long getSeekCount() {

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java Wed Oct  5 21:14:50 2011
@@ -36,7 +36,8 @@ import java.util.NavigableSet;
  * Scanner scans both the memstore and the HStore. Coalesce KeyValue stream
  * into List<KeyValue> for a single row.
  */
-class StoreScanner implements KeyValueScanner, InternalScanner, ChangedReadersObserver {
+class StoreScanner extends NonLazyKeyValueScanner
+    implements KeyValueScanner, InternalScanner, ChangedReadersObserver {
   static final Log LOG = LogFactory.getLog(StoreScanner.class);
   private Store store;
   private ScanQueryMatcher matcher;
@@ -47,6 +48,8 @@ class StoreScanner implements KeyValueSc
   // Doesnt need to be volatile because it's always accessed via synchronized methods
   private boolean closing = false;
   private final boolean isGet;
+  private final boolean explicitColumnQuery;
+  private final boolean useRowColBloom;
 
   /** We don't ever expect to change this, the constant is just for clarity. */
   static final boolean LAZY_SEEK_ENABLED_BY_DEFAULT = true;
@@ -58,6 +61,22 @@ class StoreScanner implements KeyValueSc
   // if heap == null and lastTop != null, you need to reseek given the key below
   private KeyValue lastTop = null;
 
+  /** An internal constructor. */
+  private StoreScanner(Store store, boolean cacheBlocks, Scan scan,
+      final NavigableSet<byte[]> columns){
+    this.store = store;
+    this.cacheBlocks = cacheBlocks;
+    isGet = scan.isGetScan();
+    int numCol = columns == null ? 0 : columns.size();
+    explicitColumnQuery = numCol > 0;
+
+    // We look up row-column Bloom filters for multi-column queries as part of
+    // the seek operation. However, we also look the row-column Bloom filter
+    // for multi-row (non-"get") scans because this is not done in
+    // StoreFile.passesBloomFilter(Scan, SortedSet<byte[]>).
+    useRowColBloom = numCol > 1 || (!isGet && numCol == 1);
+  }
+
   /**
    * Opens a scanner across memstore, snapshot, and all StoreFiles.
    *
@@ -68,25 +87,25 @@ class StoreScanner implements KeyValueSc
    */
   StoreScanner(Store store, Scan scan, final NavigableSet<byte[]> columns)
                               throws IOException {
-    this.store = store;
-    this.cacheBlocks = scan.getCacheBlocks();
+    this(store, scan.getCacheBlocks(), scan, columns);
     matcher = new ScanQueryMatcher(scan, store.getFamily().getName(),
         columns, store.ttl, store.comparator.getRawComparator(),
         store.minVersions, store.versionsToReturn(scan.getMaxVersions()),
         false);
 
-    this.isGet = scan.isGetScan();
-    // pass columns = try to filter out unnecessary ScanFiles
+    // Pass columns to try to filter out unnecessary StoreFiles.
     List<KeyValueScanner> scanners = getScanners(scan, columns);
 
     // Seek all scanners to the start of the Row (or if the exact matching row
     // key does not exist, then to the start of the next matching Row).
-    if (matcher.isExactColumnQuery()) {
-      for (KeyValueScanner scanner : scanners)
-        scanner.seekExactly(matcher.getStartKey(), false);
+    if (explicitColumnQuery && lazySeekEnabledGlobally) {
+      for (KeyValueScanner scanner : scanners) {
+        scanner.requestSeek(matcher.getStartKey(), false, useRowColBloom);
+      }
     } else {
-      for (KeyValueScanner scanner : scanners)
+      for (KeyValueScanner scanner : scanners) {
         scanner.seek(matcher.getStartKey());
+      }
     }
 
     // Combine all seeked scanners with a heap
@@ -104,11 +123,8 @@ class StoreScanner implements KeyValueSc
    * @param scanners ancilliary scanners
    */
   StoreScanner(Store store, Scan scan, List<? extends KeyValueScanner> scanners,
-      boolean retainDeletesInOutput)
-  throws IOException {
-    this.store = store;
-    this.cacheBlocks = false;
-    this.isGet = false;
+      boolean retainDeletesInOutput) throws IOException {
+    this(store, false, scan, null);
     matcher = new ScanQueryMatcher(scan, store.getFamily().getName(),
         null, store.ttl, store.comparator.getRawComparator(), store.minVersions,
         store.versionsToReturn(scan.getMaxVersions()), retainDeletesInOutput);
@@ -128,9 +144,7 @@ class StoreScanner implements KeyValueSc
       final NavigableSet<byte[]> columns,
       final List<KeyValueScanner> scanners)
         throws IOException {
-    this.store = null;
-    this.isGet = false;
-    this.cacheBlocks = scan.getCacheBlocks();
+    this(null, scan.getCacheBlocks(), scan, columns);
     this.matcher = new ScanQueryMatcher(scan, colFamily, columns, ttl,
         comparator.getRawComparator(), 0, scan.getMaxVersions(), false);
 
@@ -238,7 +252,6 @@ class StoreScanner implements KeyValueSc
    * @return true if there are more rows, false if scanner is done
    */
   public synchronized boolean next(List<KeyValue> outResult, int limit) throws IOException {
-    //DebugPrint.println("SS.next");
 
     checkReseek();
 
@@ -280,7 +293,6 @@ class StoreScanner implements KeyValueSc
       }
       prevKV = copyKv;
       ScanQueryMatcher.MatchCode qcode = matcher.match(copyKv);
-
       switch(qcode) {
         case INCLUDE:
         case INCLUDE_AND_SEEK_NEXT_ROW:
@@ -319,8 +331,8 @@ class StoreScanner implements KeyValueSc
           return false;
 
         case SEEK_NEXT_ROW:
-          // This is just a relatively simple end of scan fix, to short-cut end us if there is a
-          // endKey in the scan.
+          // This is just a relatively simple end of scan fix, to short-cut end
+          // us if there is an endKey in the scan.
           if (!matcher.moreRowsMayExistAfter(kv)) {
             outResult.addAll(results);
             return false;
@@ -431,8 +443,11 @@ class StoreScanner implements KeyValueSc
   public synchronized boolean reseek(KeyValue kv) throws IOException {
     //Heap cannot be null, because this is only called from next() which
     //guarantees that heap will never be null before this call.
-    return matcher.isExactColumnQuery() ? heap.seekExactly(kv, true) : 
-        heap.reseek(kv);
+    if (explicitColumnQuery && lazySeekEnabledGlobally) {
+      return heap.requestSeek(kv, true, useRowColBloom);
+    } else {
+      return heap.reseek(kv);
+    }
   }
 
   @Override
@@ -440,11 +455,6 @@ class StoreScanner implements KeyValueSc
     return 0;
   }
 
-  @Override
-  public boolean seekExactly(KeyValue kv, boolean forward) throws IOException {
-    throw new NotImplementedException();
-  }
-
   /**
    * Used in testing.
    * @return all scanners in no particular order
@@ -464,3 +474,4 @@ class StoreScanner implements KeyValueSc
   }
 
 }
+

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/CollectionBackedScanner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/CollectionBackedScanner.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/CollectionBackedScanner.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/CollectionBackedScanner.java Wed Oct  5 21:14:50 2011
@@ -26,14 +26,13 @@ import java.util.List;
 import java.util.SortedSet;
 
 import org.apache.hadoop.hbase.KeyValue;
-import org.apache.hadoop.hbase.regionserver.AbstractKeyValueScanner;
-import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
+import org.apache.hadoop.hbase.regionserver.NonLazyKeyValueScanner;
 
 /**
  * Utility scanner that wraps a sortable collection and serves
  * as a KeyValueScanner.
  */
-public class CollectionBackedScanner extends AbstractKeyValueScanner {
+public class CollectionBackedScanner extends NonLazyKeyValueScanner {
   final private Iterable<KeyValue> data;
   final KeyValue.KVComparator comparator;
   private Iterator<KeyValue> iter;

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestBlocksRead.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestBlocksRead.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestBlocksRead.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestBlocksRead.java Wed Oct  5 21:14:50 2011
@@ -253,8 +253,8 @@ public class TestBlocksRead extends HBas
     assertEquals(1, kvs.length);
     verifyData(kvs[0], "row", "col1", 3);
 
-    // Baseline expected blocks read: 4
-    kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2"), 4);
+    // Expected block reads: 3
+    kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2"), 3);
     assertEquals(2, kvs.length);
     verifyData(kvs[0], "row", "col1", 3);
     verifyData(kvs[1], "row", "col2", 4);
@@ -263,8 +263,8 @@ public class TestBlocksRead extends HBas
     putData(FAMILY, "row", "col3", 5);
     region.flushcache();
 
-    // Baseline expected blocks read: 5
-    kvs = getData(FAMILY, "row", "col3", 5);
+    // Baseline expected blocks read: 3
+    kvs = getData(FAMILY, "row", "col3", 3);
     assertEquals(1, kvs.length);
     verifyData(kvs[0], "row", "col3", 5);
 
@@ -309,8 +309,8 @@ public class TestBlocksRead extends HBas
     putData(FAMILY, "row", "col3", 13);
     region.flushcache();
 
-    // Baseline expected blocks read: 13
-    kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2", "col3"), 13);
+    // Baseline expected blocks read: 9
+    kvs = getData(FAMILY, "row", Arrays.asList("col1", "col2", "col3"), 9);
     assertEquals(3, kvs.length);
     verifyData(kvs[0], "row", "col1", 11);
     verifyData(kvs[1], "row", "col2", 12);

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestMemStore.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestMemStore.java?rev=1179442&r1=1179441&r2=1179442&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestMemStore.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/regionserver/TestMemStore.java Wed Oct  5 21:14:50 2011
@@ -533,6 +533,7 @@ public class TestMemStore extends TestCa
    * @throws InterruptedException
    */
   public void testGetNextRow() throws Exception {
+    ReadWriteConsistencyControl.resetThreadReadPoint();
     addRows(this.memstore);
     // Add more versions to make it a little more interesting.
     Thread.sleep(1);