Posted to commits@hive.apache.org by ha...@apache.org on 2013/11/06 17:43:36 UTC

svn commit: r1539392 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/ java/org/apache/hadoop/hive/ql/exec/vector/expressions/ java/org/apache/hadoop/hive/ql/optimizer/physical/ test/org/apache/hadoop/hive/ql/exec/vector/ test/org/ap...

Author: hashutosh
Date: Wed Nov  6 16:43:36 2013
New Revision: 1539392

URL: http://svn.apache.org/r1539392
Log:
HIVE-5583 : Implement support for IN (list-of-constants) filter in vectorized mode (Eric Hanson via Ashutosh Chauhan)
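
In short: a predicate of the shape "col IN (const, const, ...)" over an
integer-family, timestamp, float-family, or string column now compiles to a
single vectorized filter expression. The IN list is loaded once into a
Cuckoo-hash set, and each row batch is then filtered with constant-time
membership probes (at most two table lookups per value).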

Added:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java Wed Nov  6 16:43:36 2013
@@ -76,6 +76,7 @@ import org.apache.hadoop.hive.ql.plan.Ex
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
 import org.apache.hadoop.hive.ql.udf.UDFConv;
 import org.apache.hadoop.hive.ql.udf.UDFHex;
 import org.apache.hadoop.hive.ql.udf.UDFOPNegative;
@@ -557,6 +558,8 @@ public class VectorizationContext {
     //First handle special cases
     if (udf instanceof GenericUDFBetween) {
       return getBetweenFilterExpression(childExpr);
+    } else if (udf instanceof GenericUDFIn) {
+      return getInFilterExpression(childExpr);
     } else if (udf instanceof GenericUDFBridge) {
       VectorExpression v = getGenericUDFBridgeVectorExpression((GenericUDFBridge) udf, childExpr, mode);
       if (v != null) {
@@ -579,6 +582,104 @@ public class VectorizationContext {
   }
 
   /**
+   * Create a filter expression for column IN ( <list-of-constants> )
+   * @param childExpr
+   * @return
+   */
+  private VectorExpression getInFilterExpression(List<ExprNodeDesc> childExpr)
+      throws HiveException {
+    ExprNodeDesc colExpr = childExpr.get(0);
+    String colType = colExpr.getTypeString();
+
+    // prepare arguments for createVectorExpression
+    List<ExprNodeDesc> childrenForInList =
+        foldConstantsForUnaryExprs(childExpr.subList(1, childExpr.size()));
+
+    // Remove nulls. This is safe for a filter: "value IN ( <list> )" never accepts a
+    // row due to a NULL in <list>, since value = NULL is unknown (false in a filter).
+    childrenForInList = removeNullListEntries(childrenForInList);
+    VectorExpression expr = null;
+
+    // determine class
+    Class<?> cl = null;
+    if (isIntFamily(colType)) {
+      cl = FilterLongColumnInList.class;
+      long[] inVals = new long[childrenForInList.size()];
+      for (int i = 0; i != inVals.length; i++) {
+        inVals[i] = getIntFamilyScalarAsLong((ExprNodeConstantDesc) childrenForInList.get(i));
+      }
+      FilterLongColumnInList f = (FilterLongColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inVals);
+      expr = f;
+    } else if (colType.equals("timestamp")) {
+      cl = FilterLongColumnInList.class;
+      long[] inVals = new long[childrenForInList.size()];
+      for (int i = 0; i != inVals.length; i++) {
+        inVals[i] = getTimestampScalar(childrenForInList.get(i));
+      }
+      FilterLongColumnInList f = (FilterLongColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inVals);
+      expr = f;
+    } else if (colType.equals("string")) {
+      cl = FilterStringColumnInList.class;
+      byte[][] inVals = new byte[childrenForInList.size()][];
+      for (int i = 0; i != inVals.length; i++) {
+        inVals[i] = getStringScalarAsByteArray((ExprNodeConstantDesc) childrenForInList.get(i));
+      }
+      FilterStringColumnInList f = (FilterStringColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inVals);
+      expr = f;
+    } else if (isFloatFamily(colType)) {
+      cl = FilterDoubleColumnInList.class;
+      double[] inValsD = new double[childrenForInList.size()];
+      for (int i = 0; i != inValsD.length; i++) {
+        inValsD[i] = getNumericScalarAsDouble(childrenForInList.get(i));
+      }
+      FilterDoubleColumnInList f = (FilterDoubleColumnInList)
+          createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+      f.setInListValues(inValsD);
+      expr = f;
+    } else {
+      throw new HiveException("Type " + colType + " not supported for IN in vectorized mode");
+    }
+    return expr;
+  }
+
+  // Return a version of the input IN list with the NULL entries removed.
+  private List<ExprNodeDesc> removeNullListEntries(List<ExprNodeDesc> childrenForInList) {
+    boolean hasNulls = false;
+    for (ExprNodeDesc e : childrenForInList) {
+      if (e instanceof ExprNodeNullDesc) {
+        hasNulls = true;
+        break;
+      }
+    }
+    if (!hasNulls) {
+      return childrenForInList;
+    } else {
+      List<ExprNodeDesc> nullFreeList = new ArrayList<ExprNodeDesc>();
+      for (ExprNodeDesc e : childrenForInList) {
+        if (!(e instanceof ExprNodeNullDesc)) {
+           nullFreeList.add(e);
+        }
+      }
+      return nullFreeList;
+    }
+  }
+
+  private byte[] getStringScalarAsByteArray(ExprNodeConstantDesc exprNodeConstantDesc)
+      throws HiveException {
+    Object o = getScalarValue(exprNodeConstantDesc);
+    if (!(o instanceof byte[])) {
+      throw new HiveException("Expected constant argument of type string");
+    }
+    return (byte[]) o;
+  }
+
+  /**
    * Invoke special handling for expressions that can't be vectorized by regular
    * descriptor based lookup.
    */
@@ -850,8 +951,38 @@ public class VectorizationContext {
     }
   }
 
-  // Get a timestamp as a long in number of nanos, from a string constant.
+  private long getIntFamilyScalarAsLong(ExprNodeConstantDesc constDesc)
+      throws HiveException {
+    Object o = getScalarValue(constDesc);
+    if (o instanceof Integer) {
+      return (Integer) o;
+    } else if (o instanceof Long) {
+      return (Long) o;
+    }
+    throw new HiveException("Unexpected type when converting to long");
+  }
+
+  private double getNumericScalarAsDouble(ExprNodeDesc constDesc)
+      throws HiveException {
+    Object o = getScalarValue((ExprNodeConstantDesc) constDesc);
+    if (o instanceof Double) {
+      return (Double) o;
+    } else if (o instanceof Float) {
+      return (Float) o;
+    } else if (o instanceof Integer) {
+      return (Integer) o;
+    } else if (o instanceof Long) {
+      return (Long) o;
+    }
+    throw new HiveException("Unexpected type when converting to double");
+  }
+
+  // Get a timestamp as a long in number of nanos, from a string constant or cast
   private long getTimestampScalar(ExprNodeDesc expr) throws HiveException {
+    if (expr instanceof ExprNodeGenericFuncDesc &&
+        ((ExprNodeGenericFuncDesc) expr).getGenericUDF() instanceof GenericUDFTimestamp) {
+      return evaluateCastToTimestamp(expr);
+    }
     if (!(expr instanceof ExprNodeConstantDesc)) {
       throw new HiveException("Constant timestamp value expected for expression argument. " +
           "Non-constant argument not supported for vectorization.");
@@ -868,25 +999,29 @@ public class VectorizationContext {
       expr2.setChildren(children);
 
       // initialize and evaluate
-      ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(expr2);
-      ObjectInspector output = evaluator.initialize(null);
-      Object constant = evaluator.evaluate(null);
-      Object java = ObjectInspectorUtils.copyToStandardJavaObject(constant, output);
-
-      if (!(java instanceof Timestamp)) {
-        throw new HiveException("Udf: failed to convert from string to timestamp");
-      }
-      Timestamp ts = (Timestamp) java;
-      long result = ts.getTime();
-      result *= 1000000;    // shift left 6 digits to make room for nanos below ms precision
-      result += ts.getNanos() % 1000000;     // add in nanos, after removing the ms portion
-      return result;
+      return evaluateCastToTimestamp(expr2);
     }
 
     throw new HiveException("Udf: unhandled constant type for scalar argument. "
         + "Expecting string.");
   }
 
+  private long evaluateCastToTimestamp(ExprNodeDesc expr) throws HiveException {
+    ExprNodeGenericFuncDesc expr2 = (ExprNodeGenericFuncDesc) expr;
+    ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(expr2);
+    ObjectInspector output = evaluator.initialize(null);
+    Object constant = evaluator.evaluate(null);
+    Object java = ObjectInspectorUtils.copyToStandardJavaObject(constant, output);
+
+    if (!(java instanceof Timestamp)) {
+      throw new HiveException("Udf: failed to convert to timestamp");
+    }
+    Timestamp ts = (Timestamp) java;
+    long result = ts.getTime();
+    result *= 1000000;    // shift left 6 digits to make room for nanos below ms precision
+    result += ts.getNanos() % 1000000;     // add in nanos, after removing the ms portion
+    return result;
+  }
 
   private Constructor<?> getConstructor(Class<?> cl) throws HiveException {
     try {

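As context for getTimestampScalar() and evaluateCastToTimestamp() above:
vectorized timestamps travel as a single long holding nanoseconds since the
epoch. A minimal standalone sketch of that encoding (the class and method
names here are illustrative, not part of the patch):

    import java.sql.Timestamp;

    public class TimestampNanosSketch {
      // Encode a Timestamp the way evaluateCastToTimestamp() does: whole
      // milliseconds shifted left 6 decimal digits, plus the sub-millisecond
      // portion of getNanos().
      static long toNanos(Timestamp ts) {
        long result = ts.getTime();         // milliseconds since epoch
        result *= 1000000;                  // make room for sub-ms nanos
        result += ts.getNanos() % 1000000;  // getNanos() is nanos within the
                                            // second; % 1000000 drops the ms part
        return result;
      }

      public static void main(String[] args) {
        Timestamp ts = Timestamp.valueOf("2013-11-06 16:43:36.123456789");
        System.out.println(toNanos(ts));  // fractional-second digits: 123456789
      }
    }
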
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,435 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Random;
+
+/**
+ * A high-performance set implementation used to support fast set membership testing,
+ * using Cuckoo hashing. This is used to support fast tests of the form
+ *
+ *       column IN ( <list-of-values> )
+ *
+ * For details on the algorithm, see R. Pagh and F. F. Rodler, "Cuckoo Hashing,"
+ * Elsevier Science preprint, Dec. 2003. http://www.itu.dk/people/pagh/papers/cuckoo-jour.pdf.
+ */
+public class CuckooSetBytes {
+  private byte t1[][];
+  private byte t2[][];
+  private byte prev1[][] = null; // used for rehashing to get last set of values
+  private byte prev2[][] = null; // " "
+  private int n; // current array size
+  private static final double PADDING_FACTOR = 1.0/0.40; // have minimum 40% fill factor
+  private int salt = 0;
+  private Random gen = new Random(676983475);
+  private int rehashCount = 0;
+  private static final long INT_MASK  = 0x00000000ffffffffL;
+  private static final long BYTE_MASK = 0x00000000000000ffL;
+
+  /**
+   * Allocate a new set to hold expectedSize values. Re-allocation to expand
+   * the set is not implemented, so the expected size must be at least the
+   * size of the set to be inserted.
+   * @param expectedSize At least the size of the set of values that will be inserted.
+   */
+  public CuckooSetBytes(int expectedSize) {
+
+    // Choose array size. We have two hash tables to hold entries, so the sum
+    // of the two should have a bit more than twice as much space as the
+    // minimum required.
+    n = (int) (expectedSize * PADDING_FACTOR / 2.0);
+
+    // Try to get a prime table size (the primes, shared from CuckooSetLong, are
+    // spaced at roughly powers of 2 in magnitude) so that table behavior
+    // depends less on having a good hash function.
+    int primes[] = CuckooSetLong.primes;
+    for (int i = 0; i != primes.length; i++) {
+      if (n <= primes[i]) {
+        n = primes[i];
+        break;
+      }
+    }
+
+    t1 = new byte[n][];
+    t2 = new byte[n][];
+    updateHashSalt();
+  }
+
+  /**
+   * Return true if and only if the value in byte array b beginning at start
+   * and ending at start+len is present in the set.
+   */
+  public boolean lookup(byte[] b, int start, int len) {
+
+    return entryEqual(t1, h1(b, start, len), b, start, len)
+        || entryEqual(t2, h2(b, start, len), b, start, len);
+  }
+
+  private static boolean entryEqual(byte[][] t, int hash, byte[] b, int start, int len) {
+    return t[hash] != null && StringExpr.compare(t[hash], 0, t[hash].length, b, start, len) == 0;
+  }
+
+  public void insert(byte[] x) {
+    byte[] temp;
+    if (lookup(x, 0, x.length)) {
+      return;
+    }
+
+    // Try to insert up to n times. Rehash if that fails.
+    for(int i = 0; i != n; i++) {
+      int hash1 = h1(x, 0, x.length);
+      if (t1[hash1] == null) {
+        t1[hash1] = x;
+        return;
+      }
+
+      // swap x and t1[h1(x)]
+      temp = t1[hash1];
+      t1[hash1] = x;
+      x = temp;
+
+      int hash2 = h2(x, 0, x.length);
+      if (t2[hash2] == null) {
+        t2[hash2] = x;
+        return;
+      }
+
+      // swap x and t2[h2(x)]
+      temp = t2[hash2];
+      t2[hash2] = x;
+      x = temp;
+    }
+    rehash();
+    insert(x);
+  }
+
+  /**
+   * Insert all values in the input array into the set.
+   */
+  public void load(byte[][] a) {
+    for (byte[] x : a) {
+      insert(x);
+    }
+  }
+
+  /**
+   * Try to insert with up to n values "poked out". Return the last value poked out.
+   * If the value is not blank then we assume there was a cycle.
+   * Don't try to insert the same value twice. This is for use in rehash only,
+   * so you won't see the same value twice.
+   */
+  private byte[] tryInsert(byte[] x) {
+    byte[] temp;
+
+    for(int i = 0; i != n; i++) {
+      int hash1 = h1(x, 0, x.length);
+      if (t1[hash1] == null) {
+        t1[hash1] = x;
+        return null;
+      }
+
+      // swap x and t1[h1(x)]
+      temp = t1[hash1];
+      t1[hash1] = x;
+      x = temp;
+
+      int hash2 = h2(x, 0, x.length);
+      if (t2[hash2] == null) {
+        t2[hash2] = x;
+        return null;
+      }
+
+      // swap x and t2[h2(x)]
+      temp = t2[hash2];
+      t2[hash2] = x;
+      x = temp;
+      if (x == null) {
+        break;
+      }
+    }
+    return x;
+  }
+
+  /**
+   * first hash function
+   */
+  private int h1(byte[] b, int start, int len) {
+
+    // AND hash with mask to 0 out sign bit to make sure it's positive.
+    // Then we know taking the result mod n is in the range (0..n-1).
+    return (hash(b, start, len, 0) & 0x7FFFFFFF) % n;
+  }
+
+  /**
+   * second hash function
+   */
+  private int h2(byte[] b, int start, int len) {
+
+    // AND hash with mask to 0 out sign bit to make sure it's positive.
+    // Then we know taking the result mod n is in the range (0..n-1).
+    // Include salt as argument so this hash function can be varied
+    // if we need to rehash.
+    return (hash(b, start, len, salt) & 0x7FFFFFFF) % n;
+  }
+
+  /**
+   * In case of rehash, hash function h2 is changed by updating the
+   * salt value passed in to the function hash().
+   */
+  private void updateHashSalt() {
+    salt = gen.nextInt(0x7FFFFFFF);
+  }
+
+  private void rehash() {
+    rehashCount++;
+    if (rehashCount > 20) {
+      throw new RuntimeException("Too many rehashes");
+    }
+    updateHashSalt();
+
+    // Save original values
+    if (prev1 == null) {
+      prev1 = t1;
+      prev2 = t2;
+    }
+    t1 = new byte[n][];
+    t2 = new byte[n][];
+    for (byte[] v  : prev1) {
+      if (v != null) {
+        byte[] x = tryInsert(v);
+        if (x != null) {
+          rehash();
+          return;
+        }
+      }
+    }
+    for (byte[] v  : prev2) {
+      if (v != null) {
+        byte[] x = tryInsert(v);
+        if (x != null) {
+          rehash();
+          return;
+        }
+      }
+    }
+
+    // We succeeded in adding all the values, so
+    // clear the previous values recorded.
+    prev1 = null;
+    prev2 = null;
+  }
+
+  /**
+   * This is adapted from the org.apache.hadoop.util.hash.JenkinsHash package.
+   * The interface needed to be modified to suit the use here, by adding
+   * a start offset parameter to the hash function.
+   *
+   * In the future, folding this back into the original Hadoop package should
+   * be considered. This code could then import that package and use it.
+   * The original comments from the source are below.
+   *
+   * taken from  hashlittle() -- hash a variable-length key into a 32-bit value
+   *
+   * @param key the key (the unaligned variable-length array of bytes)
+   * @param nbytes number of bytes to include in hash
+   * @param initval can be any integer value
+   * @return a 32-bit value.  Every bit of the key affects every bit of the
+   * return value.  Two keys differing by one or two bits will have totally
+   * different hash values.
+   *
+   * <p>The best hash table sizes are powers of 2.  There is no need to do mod
+   * a prime (mod is sooo slow!).  If you need less than 32 bits, use a bitmask.
+   * For example, if you need only 10 bits, do
+   * <code>h = (h & hashmask(10));</code>
+   * In which case, the hash table should have hashsize(10) elements.
+   *
+   * <p>If you are hashing n strings byte[][] k, do it like this:
+   * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h);
+   *
+   * <p>By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
+   * code any way you wish, private, educational, or commercial.  It's free.
+   *
+   * <p>Use for hash table lookup, or anything where one collision in 2^^32 is
+   * acceptable.  Do NOT use for cryptographic purposes.
+  */
+  @SuppressWarnings("fallthrough")
+  private int hash(byte[] key, int start, int nbytes, int initval) {
+    int length = nbytes;
+    long a, b, c;       // We use longs because we don't have unsigned ints
+    a = b = c = (0x00000000deadbeefL + length + initval) & INT_MASK;
+    int offset = start;
+    for (; length > 12; offset += 12, length -= 12) {
+      a = (a + (key[offset + 0]    & BYTE_MASK)) & INT_MASK;
+      a = (a + (((key[offset + 1]  & BYTE_MASK) <<  8) & INT_MASK)) & INT_MASK;
+      a = (a + (((key[offset + 2]  & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+      a = (a + (((key[offset + 3]  & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+      b = (b + (key[offset + 4]    & BYTE_MASK)) & INT_MASK;
+      b = (b + (((key[offset + 5]  & BYTE_MASK) <<  8) & INT_MASK)) & INT_MASK;
+      b = (b + (((key[offset + 6]  & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+      b = (b + (((key[offset + 7]  & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+      c = (c + (key[offset + 8]    & BYTE_MASK)) & INT_MASK;
+      c = (c + (((key[offset + 9]  & BYTE_MASK) <<  8) & INT_MASK)) & INT_MASK;
+      c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+      c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+
+      /*
+       * mix -- mix 3 32-bit values reversibly.
+       * This is reversible, so any information in (a,b,c) before mix() is
+       * still in (a,b,c) after mix().
+       *
+       * If four pairs of (a,b,c) inputs are run through mix(), or through
+       * mix() in reverse, there are at least 32 bits of the output that
+       * are sometimes the same for one pair and different for another pair.
+       *
+       * This was tested for:
+       * - pairs that differed by one bit, by two bits, in any combination
+       *   of top bits of (a,b,c), or in any combination of bottom bits of
+       *   (a,b,c).
+       * - "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+       *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+       *    is commonly produced by subtraction) look like a single 1-bit
+       *    difference.
+       * - the base values were pseudorandom, all zero but one bit set, or
+       *   all zero plus a counter that starts at zero.
+       *
+       * Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+       * satisfy this are
+       *     4  6  8 16 19  4
+       *     9 15  3 18 27 15
+       *    14  9  3  7 17  3
+       * Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for
+       * "differ" defined as + with a one-bit base and a two-bit delta.  I
+       * used http://burtleburtle.net/bob/hash/avalanche.html to choose
+       * the operations, constants, and arrangements of the variables.
+       *
+       * This does not achieve avalanche.  There are input bits of (a,b,c)
+       * that fail to affect some output bits of (a,b,c), especially of a.
+       * The most thoroughly mixed value is c, but it doesn't really even
+       * achieve avalanche in c.
+       *
+       * This allows some parallelism.  Read-after-writes are good at doubling
+       * the number of bits affected, so the goal of mixing pulls in the
+       * opposite direction as the goal of parallelism.  I did what I could.
+       * Rotates seem to cost as much as shifts on every machine I could lay
+       * my hands on, and rotates are much kinder to the top and bottom bits,
+       * so I used rotates.
+       *
+       * #define mix(a,b,c) \
+       * { \
+       *   a -= c;  a ^= rot(c, 4);  c += b; \
+       *   b -= a;  b ^= rot(a, 6);  a += c; \
+       *   c -= b;  c ^= rot(b, 8);  b += a; \
+       *   a -= c;  a ^= rot(c,16);  c += b; \
+       *   b -= a;  b ^= rot(a,19);  a += c; \
+       *   c -= b;  c ^= rot(b, 4);  b += a; \
+       * }
+       *
+       * mix(a,b,c);
+       */
+      a = (a - c) & INT_MASK;  a ^= rot(c, 4);  c = (c + b) & INT_MASK;
+      b = (b - a) & INT_MASK;  b ^= rot(a, 6);  a = (a + c) & INT_MASK;
+      c = (c - b) & INT_MASK;  c ^= rot(b, 8);  b = (b + a) & INT_MASK;
+      a = (a - c) & INT_MASK;  a ^= rot(c,16);  c = (c + b) & INT_MASK;
+      b = (b - a) & INT_MASK;  b ^= rot(a,19);  a = (a + c) & INT_MASK;
+      c = (c - b) & INT_MASK;  c ^= rot(b, 4);  b = (b + a) & INT_MASK;
+    }
+
+    //-------------------------------- last block: affect all 32 bits of (c)
+    switch (length) {                   // all the case statements fall through
+    case 12:
+      c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+    case 11:
+      c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+    case 10:
+      c = (c + (((key[offset + 9]  & BYTE_MASK) <<  8) & INT_MASK)) & INT_MASK;
+    case  9:
+      c = (c + (key[offset + 8]    & BYTE_MASK)) & INT_MASK;
+    case  8:
+      b = (b + (((key[offset + 7]  & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+    case  7:
+      b = (b + (((key[offset + 6]  & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+    case  6:
+      b = (b + (((key[offset + 5]  & BYTE_MASK) <<  8) & INT_MASK)) & INT_MASK;
+    case  5:
+      b = (b + (key[offset + 4]    & BYTE_MASK)) & INT_MASK;
+    case  4:
+      a = (a + (((key[offset + 3]  & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+    case  3:
+      a = (a + (((key[offset + 2]  & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+    case  2:
+      a = (a + (((key[offset + 1]  & BYTE_MASK) <<  8) & INT_MASK)) & INT_MASK;
+    case  1:
+      a = (a + (key[offset + 0]    & BYTE_MASK)) & INT_MASK;
+      break;
+    case  0:
+      return (int)(c & INT_MASK);
+    }
+    /*
+     * final -- final mixing of 3 32-bit values (a,b,c) into c
+     *
+     * Pairs of (a,b,c) values differing in only a few bits will usually
+     * produce values of c that look totally different.  This was tested for
+     * - pairs that differed by one bit, by two bits, in any combination
+     *   of top bits of (a,b,c), or in any combination of bottom bits of
+     *   (a,b,c).
+     *
+     * - "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+     *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+     *   is commonly produced by subtraction) look like a single 1-bit
+     *   difference.
+     *
+     * - the base values were pseudorandom, all zero but one bit set, or
+     *   all zero plus a counter that starts at zero.
+     *
+     * These constants passed:
+     *   14 11 25 16 4 14 24
+     *   12 14 25 16 4 14 24
+     * and these came close:
+     *    4  8 15 26 3 22 24
+     *   10  8 15 26 3 22 24
+     *   11  8 15 26 3 22 24
+     *
+     * #define final(a,b,c) \
+     * {
+     *   c ^= b; c -= rot(b,14); \
+     *   a ^= c; a -= rot(c,11); \
+     *   b ^= a; b -= rot(a,25); \
+     *   c ^= b; c -= rot(b,16); \
+     *   a ^= c; a -= rot(c,4);  \
+     *   b ^= a; b -= rot(a,14); \
+     *   c ^= b; c -= rot(b,24); \
+     * }
+     *
+     */
+    c ^= b; c = (c - rot(b,14)) & INT_MASK;
+    a ^= c; a = (a - rot(c,11)) & INT_MASK;
+    b ^= a; b = (b - rot(a,25)) & INT_MASK;
+    c ^= b; c = (c - rot(b,16)) & INT_MASK;
+    a ^= c; a = (a - rot(c,4))  & INT_MASK;
+    b ^= a; b = (b - rot(a,14)) & INT_MASK;
+    c ^= b; c = (c - rot(b,24)) & INT_MASK;
+
+    return (int)(c & INT_MASK);
+  }
+
+  private static long rot(long val, int pos) {
+    return ((Integer.rotateLeft(
+        (int)(val & INT_MASK), pos)) & INT_MASK);
+  }
+}

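A usage sketch for CuckooSetBytes (an illustrative driver, not part of the
patch). The point of the (buffer, start, length) lookup signature is that a
value can be probed in place, without materializing a String or a trimmed
byte[] per row:

    import java.nio.charset.StandardCharsets;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.CuckooSetBytes;

    public class CuckooSetBytesDemo {
      public static void main(String[] args) {
        byte[][] inList = {
            "apple".getBytes(StandardCharsets.UTF_8),
            "banana".getBytes(StandardCharsets.UTF_8),
        };
        CuckooSetBytes set = new CuckooSetBytes(inList.length);
        set.load(inList);

        // Probe a subrange of a larger buffer, the way the string filter
        // passes BytesColumnVector's (vector, start, length) triplets.
        byte[] buf = "xxapplexx".getBytes(StandardCharsets.UTF_8);
        System.out.println(set.lookup(buf, 2, 5));  // true:  "apple"
        System.out.println(set.lookup(buf, 0, 2));  // false: "xx"
      }
    }
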
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+/**
+ * A high-performance set implementation used to support fast set membership testing,
+ * using Cuckoo hashing. This is used to support fast tests of the form
+ *
+ *       column IN ( <list-of-values> )
+ *
+ * For double, we simply layer over the implementation for long. Double.doubleToRawLongBits
+ * is used to convert a 64-bit double to a 64-bit long with bit-for-bit fidelity.
+ */
+public class CuckooSetDouble {
+  CuckooSetLong setLong;
+
+  public CuckooSetDouble(int expectedSize) {
+    setLong = new CuckooSetLong(expectedSize);
+  }
+
+  /**
+   * Return true if and only if the value x is present in the set.
+   */
+  public boolean lookup(double x) {
+    return setLong.lookup(Double.doubleToRawLongBits(x));
+  }
+
+  /**
+   * Insert a single value into the set.
+   */
+  public void insert(double x) {
+    setLong.insert(Double.doubleToRawLongBits(x));
+  }
+
+  /**
+   * Insert all values in the input array into the set.
+   */
+  public void load(double[] a) {
+    for (double x : a) {
+      insert(x);
+    }
+  }
+}

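Because the double variant delegates to CuckooSetLong through
Double.doubleToRawLongBits, membership means exact bit-for-bit equality of the
double representation. A small illustrative driver (not part of the patch):

    import org.apache.hadoop.hive.ql.exec.vector.expressions.CuckooSetDouble;

    public class CuckooSetDoubleDemo {
      public static void main(String[] args) {
        CuckooSetDouble set = new CuckooSetDouble(2);
        set.load(new double[] {1.5, 2.5});
        System.out.println(set.lookup(1.5));     // true
        System.out.println(set.lookup(1.4999));  // false: exact bits only
      }
    }
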
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,275 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * A high-performance set implementation used to support fast set membership testing,
+ * using Cuckoo hashing. This is used to support fast tests of the form
+ *
+ *       column IN ( <list-of-values> )
+ *
+ * For details on the algorithm, see R. Pagh and F. F. Rodler, "Cuckoo Hashing,"
+ * Elsevier Science preprint, Dec. 2003. http://www.itu.dk/people/pagh/papers/cuckoo-jour.pdf.
+ *
+ */
+public class CuckooSetLong {
+  private long t1[];
+  private long t2[];
+  private long prev1[] = null; // used for rehashing to get last set of values
+  private long prev2[] = null; // " "
+  private int n; // current array size
+  private static final double PADDING_FACTOR = 1.0/0.40; // have minimum 40% fill factor
+  private int salt[] = new int[6];
+  private Random gen = new Random(676983475);
+  private long blank = Long.MIN_VALUE;
+  private int rehashCount = 0;
+
+  // some prime numbers spaced at roughly powers of 2 in magnitude
+  public static int primes[] = {7, 13, 17, 23, 31, 53, 67, 89, 127, 269, 571, 1019, 2089,
+      4507, 8263, 16361, 32327, 65437, 131111, 258887, 525961, 999983, 2158909, 4074073,
+      8321801, 15485863, 32452867, 67867967, 122949829, 256203221, 553105253, 982451653,
+      1645333507, 2147483647};
+
+  /**
+   * Allocate a new set to hold expectedSize values. Re-allocation to expand
+   * the set is not implemented, so the expected size must be at least the
+   * size of the set to be inserted.
+   * @param expectedSize At least the size of the set of values that will be inserted.
+   */
+  public CuckooSetLong(int expectedSize) {
+
+    // Choose array size. We have two hash tables to hold entries, so the sum
+    // of the two should have a bit more than twice as much space as the
+    // minimum required.
+    n = (int) (expectedSize * PADDING_FACTOR / 2.0);
+
+    // Try to get a prime table size to have less dependence on a good hash function.
+    for (int i = 0; i != primes.length; i++) {
+      if (n <= primes[i]) {
+        n = primes[i];
+        break;
+      }
+    }
+
+    t1 = new long[n];
+    t2 = new long[n];
+    Arrays.fill(t1, blank);
+    Arrays.fill(t2, blank);
+    updateHashSalt();
+  }
+
+  /**
+   * Return true if and only if the value x is present in the set.
+   */
+  public boolean lookup(long x) {
+
+    /* Must check that x is not blank because otherwise you could
+     * get a false positive if the blank value was a value you
+     * were legitimately testing to see if it was in the set.
+     */
+    return x != blank && (t1[h1(x)] == x || t2[h2(x)] == x);
+  }
+
+  public void insert(long x) {
+    if (x == blank) {
+      findNewBlank();
+    }
+    long temp;
+    if (lookup(x)) {
+      return;
+    }
+
+    // Try to insert up to n times. Rehash if that fails.
+    for(int i = 0; i != n; i++) {
+      if (t1[h1(x)] == blank) {
+        t1[h1(x)] = x;
+        return;
+      }
+
+      // swap x and t1[h1(x)]
+      temp = t1[h1(x)];
+      t1[h1(x)] = x;
+      x = temp;
+
+      if (t2[h2(x)] == blank) {
+        t2[h2(x)] = x;
+        return;
+      }
+
+      // swap x and t2[h2(x)]
+      temp = t2[h2(x)];
+      t2[h2(x)] = x;
+      x = temp;
+    }
+    rehash();
+    insert(x);
+  }
+
+  /**
+   * Insert all values in the input array into the set.
+   */
+  public void load(long[] a) {
+    for (long x : a) {
+      insert(x);
+    }
+  }
+
+  /**
+   * Need to change current blank value to something else because it is in
+   * the input data set.
+   */
+  private void findNewBlank() {
+    long newBlank = gen.nextLong();
+    while(newBlank == blank || lookup(newBlank)) {
+      newBlank = gen.nextLong();
+    }
+
+    // replace existing blanks with new blanks
+    for(int i = 0; i != n; i++) {
+      if (t1[i] == blank) {
+        t1[i] = newBlank;
+      }
+      if (t2[i] == blank) {
+        t2[i] = newBlank;
+      }
+    }
+    blank = newBlank;
+  }
+
+  /**
+   * Try to insert with up to n values "poked out". Return the last value poked out.
+   * If the value is not blank then we assume there was a cycle.
+   * Don't try to insert the same value twice. This is for use in rehash only,
+   * so you won't see the same value twice.
+   */
+  private long tryInsert(long x) {
+    long temp;
+
+    for(int i = 0; i != n; i++) {
+      if (t1[h1(x)] == blank) {
+        t1[h1(x)] = x;
+        return blank;
+      }
+
+      // swap x and t1[h1(x)]
+      temp = t1[h1(x)];
+      t1[h1(x)] = x;
+      x = temp;
+
+      if (t2[h2(x)] == blank) {
+        t2[h2(x)] = x;
+        return blank;
+      }
+
+      // swap x and t2[h2(x)]
+      temp = t2[h2(x)];
+      t2[h2(x)] = x;
+      x = temp;
+      if (x == blank) {
+        break;
+      }
+    }
+    return x;
+  }
+
+  /**
+   * Variation of Robert Jenkins' hash function.
+   */
+  private int h1(long y) {
+    int x = (int) ((((y >>> 32) ^ y)) & 0xFFFFFFFF);
+    x = (x + salt[0]) + (x << 12);
+    x = (x ^ salt[1]) ^ (x >> 19);
+    x = (x + salt[2]) + (x << 5);
+    x = (x + salt[3]) ^ (x << 9);
+    x = (x + salt[4]) + (x << 3);
+    x = (x ^ salt[5]) ^ (x >> 16);
+
+    // Return value modulo n but always in the positive range (0..n-1).
+    // And with the mask to zero the sign bit to make the input to mod positive
+    // so the output will definitely be positive.
+    return (x & 0x7FFFFFFF) % n;
+  }
+
+  /**
+   * basic modular hash function
+   */
+  private int h2(long x) {
+
+    // Return value modulo n but always in the positive range (0..n-1).
+    // Since n is prime, this gives good spread for numbers that are multiples
+    // of one billion, which is important since timestamps internally
+    // are stored as a number of nanoseconds, and the fractional seconds
+    // part is often 0.
+    return (((int) (x % n)) + n) % n;
+  }
+
+  /**
+   * In case of rehash, hash function h2 is changed by updating the
+   * entries in the salt array with new random values.
+   */
+  private void updateHashSalt() {
+    for (int i = 0; i != 6; i++) {
+      salt[i] = gen.nextInt(0x7FFFFFFF);
+    }
+  }
+
+  private void rehash() {
+    rehashCount++;
+    if (rehashCount > 20) {
+      throw new RuntimeException("Too many rehashes");
+    }
+    updateHashSalt();
+
+    // Save original values
+    if (prev1 == null) {
+      prev1 = t1;
+      prev2 = t2;
+    }
+    t1 = new long[n];
+    t2 = new long[n];
+    Arrays.fill(t1, blank);
+    Arrays.fill(t2, blank);
+    for (long v : prev1) {
+      if (v != blank) {
+        long x = tryInsert(v);
+        if (x != blank) {
+          rehash();
+          return;
+        }
+      }
+    }
+    for (long v : prev2) {
+      if (v != blank) {
+        long x = tryInsert(v);
+        if (x != blank) {
+          rehash();
+          return;
+        }
+      }
+    }
+
+    // We succeeded in adding all the values, so
+    // clear the previous values recorded.
+    prev1 = null;
+    prev2 = null;
+  }
+}

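A usage sketch for the long set (an illustrative driver, not part of the
patch). The subtle case is the sentinel: empty slots hold a "blank" value that
starts out as Long.MIN_VALUE, so inserting that exact value makes insert()
call findNewBlank() to swap in a fresh random sentinel first:

    import org.apache.hadoop.hive.ql.exec.vector.expressions.CuckooSetLong;

    public class CuckooSetLongDemo {
      public static void main(String[] args) {
        CuckooSetLong set = new CuckooSetLong(4);
        set.load(new long[] {10, 20, 30});
        System.out.println(set.lookup(20));  // true
        System.out.println(set.lookup(25));  // false

        // Legal even though MIN_VALUE is the initial blank sentinel.
        set.insert(Long.MIN_VALUE);
        System.out.println(set.lookup(Long.MIN_VALUE));  // true
      }
    }
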
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,170 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * Evaluate IN filter on a batch for a vector of doubles.
+ */
+public class FilterDoubleColumnInList extends VectorExpression {
+  private static final long serialVersionUID = 1L;
+  private int inputCol;
+  private double[] inListValues;
+
+  // The set object containing the IN list. This is optimized for lookup
+  // of the data type of the column.
+  private transient CuckooSetDouble inSet;
+
+  public FilterDoubleColumnInList() {
+    super();
+    inSet = null;
+  }
+
+  /**
+   * After construction you must call setInListValues() to add the values to the IN set.
+   */
+  public FilterDoubleColumnInList(int colNum) {
+    this.inputCol = colNum;
+    inSet = null;
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    if (inSet == null) {
+      inSet = new CuckooSetDouble(inListValues.length);
+      inSet.load(inListValues);
+    }
+
+    DoubleColumnVector inputColVector = (DoubleColumnVector) batch.cols[inputCol];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    double[] vector = inputColVector.vector;
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero
+        // Repeating property will not change.
+
+        if (!(inSet.lookup(vector[0]))) {
+          //Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j=0; j != n; j++) {
+          int i = sel[j];
+          if (inSet.lookup(vector[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (inSet.lookup(vector[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+        //All must be selected otherwise size would be zero
+        //Repeating property will not change.
+        if (!nullPos[0]) {
+          if (!inSet.lookup(vector[0])) {
+            //Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+           if (inSet.lookup(vector[i])) {
+             sel[newSize++] = i;
+           }
+          }
+        }
+
+        // Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (inSet.lookup(vector[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    }
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public Descriptor getDescriptor() {
+
+    // This VectorExpression (IN) is a special case, so don't return a descriptor.
+    return null;
+  }
+
+  public double[] getInListValues() {
+    return this.inListValues;
+  }
+
+  public void setInListValues(double [] a) {
+    this.inListValues = a;
+  }
+}

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,170 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * Evaluate IN filter on a batch for a vector of longs.
+ */
+public class FilterLongColumnInList extends VectorExpression {
+  private static final long serialVersionUID = 1L;
+  private int inputCol;
+  private long[] inListValues;
+
+  // The set object containing the IN list. This is optimized for lookup
+  // of the data type of the column.
+  private transient CuckooSetLong inSet;
+
+  public FilterLongColumnInList() {
+    super();
+    inSet = null;
+  }
+
+  /**
+   * After construction you must call setInListValues() to add the values to the IN set.
+   */
+  public FilterLongColumnInList(int colNum) {
+    this.inputCol = colNum;
+    inSet = null;
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    if (inSet == null) {
+      inSet = new CuckooSetLong(inListValues.length);
+      inSet.load(inListValues);
+    }
+
+    LongColumnVector inputColVector = (LongColumnVector) batch.cols[inputCol];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    long[] vector = inputColVector.vector;
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero
+        // Repeating property will not change.
+
+        if (!(inSet.lookup(vector[0]))) {
+          //Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j=0; j != n; j++) {
+          int i = sel[j];
+          if (inSet.lookup(vector[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (inSet.lookup(vector[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+        //All must be selected otherwise size would be zero
+        //Repeating property will not change.
+        if (!nullPos[0]) {
+          if (!inSet.lookup(vector[0])) {
+            //Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+           if (inSet.lookup(vector[i])) {
+             sel[newSize++] = i;
+           }
+          }
+        }
+
+        // Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (inSet.lookup(vector[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    }
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public Descriptor getDescriptor() {
+
+    // This VectorExpression (IN) is a special case, so don't return a descriptor.
+    return null;
+  }
+
+  public long[] getInListValues() {
+    return this.inListValues;
+  }
+
+  public void setInListValues(long [] a) {
+    this.inListValues = a;
+  }
+}

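To see the selected-vector protocol in action, here is a hand-wired driver (an
illustrative sketch against the VectorizedRowBatch and LongColumnVector APIs
at this revision, not part of the patch). After evaluate(), the indices of the
surviving rows are compacted into batch.selected and batch.size shrinks, so
downstream operators never visit filtered-out rows:

    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterLongColumnInList;

    public class FilterLongInListDemo {
      public static void main(String[] args) {
        VectorizedRowBatch batch = new VectorizedRowBatch(1);
        LongColumnVector lcv = new LongColumnVector();  // noNulls by default
        lcv.vector[0] = 10;
        lcv.vector[1] = 15;
        lcv.vector[2] = 20;
        batch.cols[0] = lcv;
        batch.size = 3;

        FilterLongColumnInList filter = new FilterLongColumnInList(0);
        filter.setInListValues(new long[] {10, 20});
        filter.evaluate(batch);

        // Rows 0 and 2 matched: batch.size == 2, batch.selectedInUse == true,
        // and batch.selected starts with {0, 2}.
        System.out.println(batch.size);
      }
    }
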
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * Evaluate an IN filter on a batch for a vector of strings.
+ * This is optimized so that no objects have to be created in
+ * the inner loop, and there is a hash table implemented
+ * with Cuckoo hashing that has fast lookup to do the IN test.
+ */
+public class FilterStringColumnInList extends VectorExpression {
+  private static final long serialVersionUID = 1L;
+  private int inputCol;
+  private byte[][] inListValues;
+
+  // The set object containing the IN list. This is optimized for lookup
+  // of the data type of the column.
+  private transient CuckooSetBytes inSet;
+
+  public FilterStringColumnInList() {
+    super();
+    inSet = null;
+  }
+
+  /**
+   * After construction you must call setInListValues() to add the values to the IN set.
+   */
+  public FilterStringColumnInList(int colNum) {
+    this.inputCol = colNum;
+    inSet = null;
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    if (inSet == null) {
+      inSet = new CuckooSetBytes(inListValues.length);
+      inSet.load(inListValues);
+    }
+
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[inputCol];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] start = inputColVector.start;
+    int[] len = inputColVector.length;
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero
+        // Repeating property will not change.
+        if (!(inSet.lookup(vector[0], start[0], len[0]))) {
+
+          // Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (inSet.lookup(vector[i], start[i], len[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (inSet.lookup(vector[i], start[i], len[i])) {
+            sel[newSize++] = i;
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero
+        // Repeating property will not change.
+        if (!nullPos[0]) {
+          if (!inSet.lookup(vector[0], start[0], len[0])) {
+
+            // Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+        for(int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+           if (inSet.lookup(vector[i], start[i], len[i])) {
+             sel[newSize++] = i;
+           }
+          }
+        }
+
+        // Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for(int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (inSet.lookup(vector[i], start[i], len[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    }
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public Descriptor getDescriptor() {
+
+    // This VectorExpression (IN) is a special case, so don't return a descriptor.
+    return null;
+  }
+
+  public byte[][] getInListValues() {
+    return this.inListValues;
+  }
+
+  public void setInListValues(byte [][] a) {
+    this.inListValues = a;
+  }
+}

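The same pattern for strings, using reference-style rows so no bytes are
copied per row (an illustrative sketch, assuming BytesColumnVector.setRef() as
it exists at this revision; not part of the patch):

    import java.nio.charset.StandardCharsets;
    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColumnInList;

    public class FilterStringInListDemo {
      public static void main(String[] args) {
        VectorizedRowBatch batch = new VectorizedRowBatch(1);
        BytesColumnVector bcv = new BytesColumnVector();

        // One backing buffer, three (start, length) references into it.
        byte[] buf = "AlphaBravoDelta".getBytes(StandardCharsets.UTF_8);
        bcv.setRef(0, buf, 0, 5);   // "Alpha"
        bcv.setRef(1, buf, 5, 5);   // "Bravo"
        bcv.setRef(2, buf, 10, 5);  // "Delta"
        batch.cols[0] = bcv;
        batch.size = 3;

        FilterStringColumnInList filter = new FilterStringColumnInList(0);
        filter.setInListValues(new byte[][] {
            "Alpha".getBytes(StandardCharsets.UTF_8),
            "Bravo".getBytes(StandardCharsets.UTF_8)});
        filter.evaluate(batch);

        System.out.println(batch.size);  // 2: rows 0 and 1 survive
      }
    }
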
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java Wed Nov  6 16:43:36 2013
@@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.udf.gen
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
@@ -172,6 +173,7 @@ public class Vectorizer implements Physi
     supportedGenericUDFs.add(GenericUDFConcat.class);
     supportedGenericUDFs.add(GenericUDFAbs.class);
     supportedGenericUDFs.add(GenericUDFBetween.class);
+    supportedGenericUDFs.add(GenericUDFIn.class);
 
     // For type casts
     supportedGenericUDFs.add(UDFToLong.class);
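
The one-line registration above is what allows plans containing IN predicates to
pass the vectorizer's validation. A rough, self-contained sketch of a whitelist
gate of this shape (JDK classes stand in for the UDF hierarchy here, and the
real check may differ):

    import java.util.HashSet;
    import java.util.Set;

    // Illustrative whitelist gate: an operator is vectorizable only if every
    // UDF class it uses has been registered. Stand-in types, not Hive code.
    public class UdfWhitelistSketch {
      private static final Set<Class<?>> SUPPORTED = new HashSet<Class<?>>();
      static {
        SUPPORTED.add(Integer.class); // stand-in for e.g. GenericUDFIn
      }

      static boolean canVectorize(Object udfInstance) {
        return SUPPORTED.contains(udfInstance.getClass());
      }

      public static void main(String[] args) {
        System.out.println(canVectorize(42));   // true: Integer registered
        System.out.println(canVectorize("IN")); // false: String not registered
      }
    }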

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java Wed Nov  6 16:43:36 2013
@@ -92,6 +92,7 @@ import org.apache.hadoop.hive.ql.udf.UDF
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
@@ -939,4 +940,42 @@ public class TestVectorizationContext {
     ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
     assertTrue(ve instanceof FilterDoubleColumnNotBetween);
   }
+
+  @Test
+  public void testInFilters() throws HiveException {
+    ExprNodeColumnDesc col1Expr = new  ExprNodeColumnDesc(String.class, "col1", "table", false);
+    ExprNodeConstantDesc constDesc = new ExprNodeConstantDesc("Alpha");
+    ExprNodeConstantDesc constDesc2 = new ExprNodeConstantDesc("Bravo");
+
+    // string IN
+    GenericUDFIn udf = new GenericUDFIn();
+    ExprNodeGenericFuncDesc exprDesc = new ExprNodeGenericFuncDesc();
+    exprDesc.setGenericUDF(udf);
+    List<ExprNodeDesc> children1 = new ArrayList<ExprNodeDesc>();
+    children1.add(col1Expr);
+    children1.add(constDesc);
+    children1.add(constDesc2);
+    exprDesc.setChildren(children1);
+
+    Map<String, Integer> columnMap = new HashMap<String, Integer>();
+    columnMap.put("col1", 1);
+    columnMap.put("col2", 2);
+    VectorizationContext vc = new VectorizationContext(columnMap, 2);
+    VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterStringColumnInList);
+
+    // long IN
+    children1.set(0, new ExprNodeColumnDesc(Long.class, "col1", "table", false));
+    children1.set(1, new ExprNodeConstantDesc(10));
+    children1.set(2, new ExprNodeConstantDesc(20));
+    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterLongColumnInList);
+
+    // double IN
+    children1.set(0, new ExprNodeColumnDesc(Double.class, "col1", "table", false));
+    children1.set(1, new ExprNodeConstantDesc(10d));
+    children1.set(2, new ExprNodeConstantDesc(20d));
+    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
+    assertTrue(ve instanceof FilterDoubleColumnInList);
+  }
 }
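
The three assertions in testInFilters() pin down the type-to-class dispatch for
vectorized IN filters. A throwaway sketch of that mapping (the helper name and
the fallback behavior are assumptions, not the VectorizationContext code):

    // Hypothetical dispatch table mirroring what testInFilters() asserts.
    public class InFilterDispatchSketch {
      static String inFilterClassFor(String columnType) {
        if ("string".equals(columnType)) {
          return "FilterStringColumnInList";
        } else if ("bigint".equals(columnType)) {
          return "FilterLongColumnInList";
        } else if ("double".equals(columnType)) {
          return "FilterDoubleColumnInList";
        }
        return null; // other types: no specialized IN filter in this patch
      }

      public static void main(String[] args) {
        System.out.println(inFilterClassFor("bigint")); // FilterLongColumnInList
      }
    }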

Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java (added)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java Wed Nov  6 16:43:36 2013
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import static org.junit.Assert.*;
+
+import java.util.Random;
+
+import org.junit.Test;
+
+public class TestCuckooSet {
+
+  // Maximum size of the randomly generated test sets.
+  private static final int MAX_SIZE = 65437;
+
+  @Test
+  public void testSetLong () {
+
+    // Set of values to look for. Include the original blank value Long.MIN_VALUE to make sure
+    // the process of choosing a new blank works.
+    Long[] values = {1L, 2L, 3L, 1000L, 2000L, 3000L, 8L, 8L, 9L, 13L, 17L,
+        22L, 23L, 24L, 25L, -26L, 27L,
+        28L, 29L, 30L, 111111111111111L, -444444444444444L, Long.MIN_VALUE};
+    Long[] negatives = {0L, 4L, 4000L, -2L, 19L, 222222222222222L, -333333333333333L};
+    CuckooSetLong s = new CuckooSetLong(values.length);
+    for(Long v : values) {
+      s.insert(v);
+    }
+
+    // test that the values we added are there
+    for(Long v : values) {
+      assertTrue(s.lookup(v));
+    }
+
+    // test that values that we know are missing are shown to be absent
+    for (Long v : negatives) {
+      assertFalse(s.lookup(v));
+    }
+
+    // Set of values to look for.
+    Long[] values2 = {1L, 2L, 3L, 1000L, 2000L, 3000L, 8L, 8L, 9L, 13L, 17L, 22L, 23L,
+        24L, 25L, -26L, 27L,
+        28L, 29L, 30L, 111111111111111L, -444444444444444L};
+
+    // Include the original blank value Long.MIN_VALUE in the negatives to verify
+    // that the blank value itself is correctly reported as absent.
+    Long[] negatives2 = {0L, 4L, 4000L, -2L, 19L, 222222222222222L,
+        -333333333333333L, Long.MIN_VALUE};
+    s = new CuckooSetLong(values2.length);
+    for(Long v : values2) {
+      s.insert(v);
+    }
+
+    // test that the values we added are there
+    for(Long v : values2) {
+      assertTrue(s.lookup(v));
+    }
+
+    // test that values that we know are missing are shown to be absent
+    for (Long v : negatives2) {
+      assertFalse(s.lookup(v));
+    }
+
+  }
+
+  // load multiple random sets of Long values
+  @Test
+  public void testSetLongRandom() {
+    long[] values;
+    Random gen = new Random(98763537);
+    for(int i = 0; i < 200;) {
+
+      // Make a random array of longs
+      int size = gen.nextInt() % MAX_SIZE;
+      if (size <= 0) {   // ensure size is >= 1, otherwise try again
+        continue;
+      }
+      i++;
+      values = new long[size];
+      loadRandom(values, gen);
+
+      // load them into a SetLong
+      CuckooSetLong s = new CuckooSetLong(size);
+      loadSet(s, values);
+
+      // look them up to make sure they are all there
+      for (int j = 0; j != size; j++) {
+        assertTrue(s.lookup(values[j]));
+      }
+    }
+  }
+
+  @Test
+  public void testSetDouble() {
+
+    // Set of values to look for.
+    Double[] values = {7021.0D, 5780.0D, 0D, -1D, 1.999e50D};
+    Double[] negatives = {7000.0D, -2D, 1.9999e50D};
+    CuckooSetDouble s = new CuckooSetDouble(values.length);
+    for(Double v : values) {
+      s.insert(v);
+    }
+
+    // test that the values we added are there
+    for(Double v : values) {
+      assertTrue(s.lookup(v));
+    }
+
+    // test that values that we know are missing are shown to be absent
+    for (Double v : negatives) {
+      assertFalse(s.lookup(v));
+    }
+  }
+
+  @Test
+  public void testSetBytes() {
+    String[] strings = {"foo", "bar", "baz", "a", "", "x1341", "Z"};
+    String[] negativeStrings = {"not", "in", "the", "set", "foobar"};
+    byte[][] values = getByteArrays(strings);
+    byte[][] negatives = getByteArrays(negativeStrings);
+
+    // load set
+    CuckooSetBytes s = new CuckooSetBytes(strings.length);
+    for(byte[] v : values) {
+      s.insert(v);
+    }
+
+    // test that the values we added are there
+    for(byte[] v : values) {
+      assertTrue(s.lookup(v, 0, v.length));
+    }
+
+    // test that values that we know are missing are shown to be absent
+    for (byte[] v : negatives) {
+      assertFalse(s.lookup(v, 0, v.length));
+    }
+
+    // Test that we can search correctly using a larger buffer, pulling a
+    // sequence of bytes out of its middle. In this case it is the 3-letter
+    // sequence "foo".
+    byte[] buf = getUTF8Bytes("thewordfooisinhere");
+    assertTrue(s.lookup(buf, 7, 3));
+  }
+
+  @Test
+  public void testSetBytesLargeRandom() {
+    byte[][] values;
+    Random gen = new Random(98763537);
+    for(int i = 0; i < 200;) {
+
+      // Make a random array of byte arrays
+      int size = gen.nextInt() % MAX_SIZE;
+      if (size <= 0) {   // ensure size is >= 1, otherwise try again
+        continue;
+      }
+      i++;
+      values = new byte[size][];
+      loadRandomBytes(values, gen);
+
+      // load them into a set
+      CuckooSetBytes s = new CuckooSetBytes(size);
+      loadSet(s, values);
+
+      // look them up to make sure they are all there
+      for (int j = 0; j != size; j++) {
+        assertTrue(s.lookup(values[j], 0, values[j].length));
+      }
+    }
+  }
+
+  public void loadRandomBytes(byte[][] values, Random gen) {
+    for (int i = 0; i != values.length; i++) {
+      values[i] = getUTF8Bytes(Integer.toString(gen.nextInt()));
+    }
+  }
+
+  private byte[] getUTF8Bytes(String s) {
+    byte[] v = null;
+    try {
+      v = s.getBytes("UTF-8");
+    } catch (Exception e) {
+      // ignore: UTF-8 is always supported, so this cannot happen
+    }
+    return v;
+  }
+
+  // Get an array of UTF-8 byte arrays from an array of strings
+  private byte[][] getByteArrays(String[] strings) {
+    byte[][] values = new byte[strings.length][];
+    for(int i = 0; i != strings.length; i++) {
+      try {
+        values[i] = strings[i].getBytes("UTF-8");
+      } catch (Exception e) {
+        // ignore: UTF-8 is always supported, so this cannot happen
+      }
+    }
+    return values;
+  }
+
+  private void loadSet(CuckooSetLong s, long[] values) {
+    for (long v : values) {
+      s.insert(v);
+    }
+  }
+
+  private void loadSet(CuckooSetBytes s, byte[][] values) {
+    for (byte[] v: values) {
+      s.insert(v);
+    }
+  }
+
+  private void loadRandom(long[] a, Random gen)  {
+    int size = a.length;
+    for(int i = 0; i != size; i++) {
+      a[i] = gen.nextLong();
+    }
+  }
+}
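
For readers new to the data structure under test: cuckoo hashing keeps two
tables, every key has exactly one candidate slot per table, and a membership
probe therefore touches at most two entries. A compact sketch of that scheme
(the hash mixing, sizing, and cycle handling here are assumptions, not the
CuckooSet* implementation):

    import java.util.Arrays;

    // Two-table cuckoo set over long keys. The real sets also handle keys
    // equal to the blank sentinel by choosing a new blank; omitted here.
    public class CuckooSketch {
      private final long[] t1;
      private final long[] t2;
      private static final long BLANK = Long.MIN_VALUE; // empty-slot marker

      public CuckooSketch(int slots) {
        t1 = new long[slots];
        t2 = new long[slots];
        Arrays.fill(t1, BLANK);
        Arrays.fill(t2, BLANK);
      }

      private int h1(long x) {
        return (int) ((x * 0x9E3779B97F4A7C15L >>> 33) % t1.length);
      }

      private int h2(long x) {
        return (int) ((Long.rotateLeft(x, 31) * 0xC2B2AE3D27D4EB4FL >>> 33)
            % t2.length);
      }

      public boolean lookup(long x) {
        return t1[h1(x)] == x || t2[h2(x)] == x; // at most two probes
      }

      public void insert(long x) {
        long cur = x;
        for (int kicks = 0; kicks <= t1.length; kicks++) {
          int i = h1(cur);
          if (t1[i] == BLANK || t1[i] == cur) { t1[i] = cur; return; }
          long tmp = t1[i]; t1[i] = cur; cur = tmp; // evict to other table
          int j = h2(cur);
          if (t2[j] == BLANK || t2[j] == cur) { t2[j] = cur; return; }
          tmp = t2[j]; t2[j] = cur; cur = tmp;
        }
        throw new IllegalStateException("eviction cycle: grow and rehash");
      }

      public static void main(String[] args) {
        CuckooSketch s = new CuckooSketch(16);
        for (long v : new long[] {1, 2, 1000, -26}) {
          s.insert(v);
        }
        System.out.println(s.lookup(1000)); // true
        System.out.println(s.lookup(4));    // false
      }
    }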

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java Wed Nov  6 16:43:36 2013
@@ -543,4 +543,189 @@ public class TestVectorFilterExpressions
     assertTrue(vrb.selectedInUse);
     assertEquals(0, vrb.selected[0]);
   }
+
+  /**
+   * Test the IN filter VectorExpression classes.
+   */
+
+  @Test
+  public void testFilterLongIn() {
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+    LongColumnVector lcv0 = (LongColumnVector) vrb.cols[0];
+    long[] inList = {5, 20};
+    FilterLongColumnInList f = new FilterLongColumnInList(0);
+    f.setInListValues(inList);
+    VectorExpression expr1 = f;
+
+    // Basic case
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+
+    expr1.evaluate(vrb);
+
+    assertEquals(2, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(0, vrb.selected[0]);
+    assertEquals(1, vrb.selected[1]);
+
+    // With nulls
+    VectorizedRowBatch vrb1 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+
+    lcv0 = (LongColumnVector) vrb1.cols[0];
+
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+
+    lcv0.noNulls = false;
+    lcv0.isNull[0] = true;
+    lcv0.isNull[2] = true;
+
+    expr1.evaluate(vrb1);
+    assertEquals(1, vrb1.size);
+    assertTrue(vrb1.selectedInUse);
+    assertEquals(1, vrb1.selected[0]);
+
+    // With nulls and selected
+    VectorizedRowBatch vrb2 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        7, 2, seed);
+    vrb2.selectedInUse = true;
+    vrb2.selected[0] = 1;
+    vrb2.selected[1] = 2;
+    vrb2.selected[2] = 4;
+    vrb2.size = 3;
+
+    lcv0 = (LongColumnVector) vrb2.cols[0];
+
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+    lcv0.vector[5] = 19;
+    lcv0.vector[6] = 21;
+
+    lcv0.noNulls = false;
+    lcv0.isNull[0] = true;
+    lcv0.isNull[2] = true;
+    lcv0.isNull[5] = true;
+
+    expr1.evaluate(vrb2);
+    assertEquals(1, vrb2.size);
+    assertEquals(1, vrb2.selected[0]);
+
+    // Repeating non null
+    VectorizedRowBatch vrb3 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        7, 2, seed);
+    lcv0 = (LongColumnVector) vrb3.cols[0];
+
+    lcv0.isRepeating = true;
+    lcv0.vector[0] = 5;
+    lcv0.vector[1] = 20;
+    lcv0.vector[2] = 17;
+    lcv0.vector[3] = 15;
+    lcv0.vector[4] = 10;
+
+    expr1.evaluate(vrb3);
+    assertEquals(7, vrb3.size);
+    assertFalse(vrb3.selectedInUse);
+    assertTrue(lcv0.isRepeating);
+
+    // Repeating null
+    lcv0.noNulls = false;
+    lcv0.vector[0] = 5;
+    lcv0.isNull[0] = true;
+
+    expr1.evaluate(vrb3);
+    assertEquals(0, vrb3.size);
+  }
+
+  @Test
+  public void testFilterDoubleIn() {
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        5, 2, seed);
+    DoubleColumnVector dcv0 = new DoubleColumnVector();
+    vrb.cols[0] = dcv0;
+    double[] inList = {5.0, 20.2};
+    FilterDoubleColumnInList f = new FilterDoubleColumnInList(0);
+    f.setInListValues(inList);
+    VectorExpression expr1 = f;
+
+    // Basic sanity check. The remaining cases are skipped because the logic mirrors the Long case.
+    dcv0.vector[0] = 5.0;
+    dcv0.vector[1] = 20.2;
+    dcv0.vector[2] = 17.0;
+    dcv0.vector[3] = 15.0;
+    dcv0.vector[4] = 10.0;
+
+    expr1.evaluate(vrb);
+
+    assertEquals(2, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(0, vrb.selected[0]);
+    assertEquals(1, vrb.selected[1]);
+  }
+
+  @Test
+  public void testFilterStringIn() {
+    int seed = 17;
+    VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+        3, 2, seed);
+    vrb.cols[0] = new BytesColumnVector();
+    BytesColumnVector bcv = (BytesColumnVector) vrb.cols[0];
+
+    bcv.initBuffer();
+    bcv.setVal(0, a, 0, 1);
+    bcv.setVal(1, b, 0, 1);
+    bcv.setVal(2, c, 0, 1);
+
+    VectorExpression expr = new FilterStringColumnInList(0);
+    byte[][] inList = {b, c};
+    ((FilterStringColumnInList) expr).setInListValues(inList);
+
+    // basic test
+    expr.evaluate(vrb);
+
+    assertEquals(2, vrb.size);
+    assertTrue(vrb.selectedInUse);
+    assertEquals(1, vrb.selected[0]);
+    assertEquals(2, vrb.selected[1]);
+
+    // nulls
+    vrb.selectedInUse = false;
+    vrb.size = 3;
+    bcv.noNulls = false;
+    bcv.isNull[2] = true;
+    expr.evaluate(vrb);
+    assertEquals(1, vrb.size);
+    assertEquals(1, vrb.selected[0]);
+    assertTrue(vrb.selectedInUse);
+
+    // repeating
+    vrb.selectedInUse = false;
+    vrb.size = 3;
+    bcv.noNulls = true;
+    bcv.isRepeating = true;
+    expr.evaluate(vrb);
+    assertEquals(0, vrb.size);
+
+    // nulls and repeating
+    vrb.selectedInUse = false;
+    vrb.size = 3;
+    bcv.noNulls = false;
+    bcv.isRepeating = true;
+    bcv.isNull[0] = true;
+    bcv.setVal(0, b, 0, 1);
+    expr.evaluate(vrb);
+    assertEquals(0, vrb.size);
+  }
 }