You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/11/06 17:43:36 UTC
svn commit: r1539392 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/exec/vector/
java/org/apache/hadoop/hive/ql/exec/vector/expressions/
java/org/apache/hadoop/hive/ql/optimizer/physical/
test/org/apache/hadoop/hive/ql/exec/vector/ test/org/ap...
Author: hashutosh
Date: Wed Nov 6 16:43:36 2013
New Revision: 1539392
URL: http://svn.apache.org/r1539392
Log:
HIVE-5583 : Implement support for IN (list-of-constants) filter in vectorized mode (Eric Hanson via Ashutosh Chauhan)
Added:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java Wed Nov 6 16:43:36 2013
@@ -76,6 +76,7 @@ import org.apache.hadoop.hive.ql.plan.Ex
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
import org.apache.hadoop.hive.ql.udf.UDFConv;
import org.apache.hadoop.hive.ql.udf.UDFHex;
import org.apache.hadoop.hive.ql.udf.UDFOPNegative;
@@ -557,6 +558,8 @@ public class VectorizationContext {
//First handle special cases
if (udf instanceof GenericUDFBetween) {
return getBetweenFilterExpression(childExpr);
+ } else if (udf instanceof GenericUDFIn) {
+ return getInFilterExpression(childExpr);
} else if (udf instanceof GenericUDFBridge) {
VectorExpression v = getGenericUDFBridgeVectorExpression((GenericUDFBridge) udf, childExpr, mode);
if (v != null) {
@@ -579,6 +582,104 @@ public class VectorizationContext {
}
/**
+ * Create a filter expression for column IN ( <list-of-constants> )
+ * @param childExpr
+ * @return
+ */
+ private VectorExpression getInFilterExpression(List<ExprNodeDesc> childExpr)
+ throws HiveException {
+ ExprNodeDesc colExpr = childExpr.get(0);
+ String colType = colExpr.getTypeString();
+
+ // prepare arguments for createVectorExpression
+ List<ExprNodeDesc> childrenForInList =
+ foldConstantsForUnaryExprs(childExpr.subList(1, childExpr.size()));
+
+ // Remove nulls. This is safe because "value IN ( <list> )" is never true for a NULL member
+ // of <list>, under SQL semantics, because value = NULL is always false.
+ childrenForInList = removeNullListEntries(childrenForInList);
+ VectorExpression expr = null;
+
+ // determine class
+ Class<?> cl = null;
+ if (isIntFamily(colType)) {
+ cl = FilterLongColumnInList.class;
+ long[] inVals = new long[childrenForInList.size()];
+ for (int i = 0; i != inVals.length; i++) {
+ inVals[i] = getIntFamilyScalarAsLong((ExprNodeConstantDesc) childrenForInList.get(i));
+ }
+ FilterLongColumnInList f = (FilterLongColumnInList)
+ createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+ f.setInListValues(inVals);
+ expr = f;
+ } else if (colType.equals("timestamp")) {
+ cl = FilterLongColumnInList.class;
+ long[] inVals = new long[childrenForInList.size()];
+ for (int i = 0; i != inVals.length; i++) {
+ inVals[i] = getTimestampScalar(childrenForInList.get(i));
+ }
+ FilterLongColumnInList f = (FilterLongColumnInList)
+ createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+ f.setInListValues(inVals);
+ expr = f;
+ } else if (colType.equals("string")) {
+ cl = FilterStringColumnInList.class;
+ byte[][] inVals = new byte[childrenForInList.size()][];
+ for (int i = 0; i != inVals.length; i++) {
+ inVals[i] = getStringScalarAsByteArray((ExprNodeConstantDesc) childrenForInList.get(i));
+ }
+ FilterStringColumnInList f =(FilterStringColumnInList)
+ createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+ f.setInListValues(inVals);
+ expr = f;
+ } else if (isFloatFamily(colType)) {
+ cl = FilterDoubleColumnInList.class;
+ double[] inValsD = new double[childrenForInList.size()];
+ for (int i = 0; i != inValsD.length; i++) {
+ inValsD[i] = getNumericScalarAsDouble(childrenForInList.get(i));
+ }
+ FilterDoubleColumnInList f = (FilterDoubleColumnInList)
+ createVectorExpression(cl, childExpr.subList(0, 1), Mode.PROJECTION);
+ f.setInListValues(inValsD);
+ expr = f;
+ } else {
+ throw new HiveException("Type " + colType + " not supported for IN in vectorized mode");
+ }
+ return expr;
+ }
+
+ // Return a version of the input IN list with the NULL entries removed.
+ private List<ExprNodeDesc> removeNullListEntries(List<ExprNodeDesc> childrenForInList) {
+ boolean hasNulls = false;
+ for (ExprNodeDesc e : childrenForInList) {
+ if (e instanceof ExprNodeNullDesc) {
+ hasNulls = true;
+ break;
+ }
+ }
+ if (!hasNulls) {
+ return childrenForInList;
+ } else {
+ List<ExprNodeDesc> nullFreeList = new ArrayList<ExprNodeDesc>();
+ for (ExprNodeDesc e : childrenForInList) {
+ if (!(e instanceof ExprNodeNullDesc)) {
+ nullFreeList.add(e);
+ }
+ }
+ return nullFreeList;
+ }
+ }
+
+ private byte[] getStringScalarAsByteArray(ExprNodeConstantDesc exprNodeConstantDesc)
+ throws HiveException {
+ Object o = getScalarValue(exprNodeConstantDesc);
+ if (!(o instanceof byte[])) {
+ throw new HiveException("Expected constant argument of type string");
+ }
+ return (byte[]) o;
+ }
+
+ /**
* Invoke special handling for expressions that can't be vectorized by regular
* descriptor based lookup.
*/
@@ -850,8 +951,38 @@ public class VectorizationContext {
}
}
- // Get a timestamp as a long in number of nanos, from a string constant.
+ private long getIntFamilyScalarAsLong(ExprNodeConstantDesc constDesc)
+ throws HiveException {
+ Object o = getScalarValue(constDesc);
+ if (o instanceof Integer) {
+ return (Integer) o;
+ } else if (o instanceof Long) {
+ return (Long) o;
+ }
+ throw new HiveException("Unexpected type when converting to long");
+ }
+
+ private double getNumericScalarAsDouble(ExprNodeDesc constDesc)
+ throws HiveException {
+ Object o = getScalarValue((ExprNodeConstantDesc) constDesc);
+ if (o instanceof Double) {
+ return (Double) o;
+ } else if (o instanceof Float) {
+ return (Float) o;
+ } else if (o instanceof Integer) {
+ return (Integer) o;
+ } else if (o instanceof Long) {
+ return (Long) o;
+ }
+ throw new HiveException("Unexpected type when converting to double");
+ }
+
+ // Get a timestamp as a long in number of nanos, from a string constant or cast
private long getTimestampScalar(ExprNodeDesc expr) throws HiveException {
+ if (expr instanceof ExprNodeGenericFuncDesc &&
+ ((ExprNodeGenericFuncDesc) expr).getGenericUDF() instanceof GenericUDFTimestamp) {
+ return evaluateCastToTimestamp(expr);
+ }
if (!(expr instanceof ExprNodeConstantDesc)) {
throw new HiveException("Constant timestamp value expected for expression argument. " +
"Non-constant argument not supported for vectorization.");
@@ -868,25 +999,29 @@ public class VectorizationContext {
expr2.setChildren(children);
// initialize and evaluate
- ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(expr2);
- ObjectInspector output = evaluator.initialize(null);
- Object constant = evaluator.evaluate(null);
- Object java = ObjectInspectorUtils.copyToStandardJavaObject(constant, output);
-
- if (!(java instanceof Timestamp)) {
- throw new HiveException("Udf: failed to convert from string to timestamp");
- }
- Timestamp ts = (Timestamp) java;
- long result = ts.getTime();
- result *= 1000000; // shift left 6 digits to make room for nanos below ms precision
- result += ts.getNanos() % 1000000; // add in nanos, after removing the ms portion
- return result;
+ return evaluateCastToTimestamp(expr2);
}
throw new HiveException("Udf: unhandled constant type for scalar argument. "
+ "Expecting string.");
}
+ private long evaluateCastToTimestamp(ExprNodeDesc expr) throws HiveException {
+ ExprNodeGenericFuncDesc expr2 = (ExprNodeGenericFuncDesc) expr;
+ ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(expr2);
+ ObjectInspector output = evaluator.initialize(null);
+ Object constant = evaluator.evaluate(null);
+ Object java = ObjectInspectorUtils.copyToStandardJavaObject(constant, output);
+
+ if (!(java instanceof Timestamp)) {
+ throw new HiveException("Udf: failed to convert to timestamp");
+ }
+ Timestamp ts = (Timestamp) java;
+ long result = ts.getTime();
+ result *= 1000000; // shift left 6 digits to make room for nanos below ms precision
+ result += ts.getNanos() % 1000000; // add in nanos, after removing the ms portion
+ return result;
+ }
private Constructor<?> getConstructor(Class<?> cl) throws HiveException {
try {
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetBytes.java Wed Nov 6 16:43:36 2013
@@ -0,0 +1,435 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Random;
+
+/**
+ * A high-performance set implementation used to support fast set membership testing,
+ * using Cuckoo hashing. This is used to support fast tests of the form
+ *
+ * column IN ( <list-of-values> )
+ *
+ * For details on the algorithm, see R. Pagh and F. F. Rodler, "Cuckoo Hashing,"
+ * Elsevier Science preprint, Dec. 2003. http://www.itu.dk/people/pagh/papers/cuckoo-jour.pdf.
+ */
+public class CuckooSetBytes {
+ private byte t1[][];
+ private byte t2[][];
+ private byte prev1[][] = null; // used for rehashing to get last set of values
+ private byte prev2[][] = null; // " "
+ private int n; // current array size
+ private static final double PADDING_FACTOR = 1.0/0.40; // have minimum 40% fill factor
+ private int salt = 0;
+ private Random gen = new Random(676983475);
+ private int rehashCount = 0;
+ private static long INT_MASK = 0x00000000ffffffffL;
+ private static long BYTE_MASK = 0x00000000000000ffL;
+
+ /**
+ * Allocate a new set to hold expectedSize values. Re-allocation to expand
+ * the set is not implemented, so the expected size must be at least the
+ * size of the set to be inserted.
+ * @param expectedSize At least the size of the set of values that will be inserted.
+ */
+ public CuckooSetBytes(int expectedSize) {
+
+ // Choose array size. We have two hash tables to hold entries, so the sum
+ // of the two should have a bit more than twice as much space as the
+ // minimum required.
+ n = (int) (expectedSize * PADDING_FACTOR / 2.0);
+
+ // some prime numbers spaced about at powers of 2 in magnitude
+
+ // try to get prime number table size to have less dependence on good hash function
+ int primes[] = CuckooSetLong.primes;
+ for (int i = 0; i != primes.length; i++) {
+ if (n <= primes[i]) {
+ n = primes[i];
+ break;
+ }
+ }
+
+ t1 = new byte[n][];
+ t2 = new byte[n][];
+ updateHashSalt();
+ }
+
+ /**
+ * Return true if and only if the value in byte array b beginning at start
+ * and ending at start+len is present in the set.
+ */
+ public boolean lookup(byte[] b, int start, int len) {
+
+ return entryEqual(t1, h1(b, start, len), b, start, len)
+ || entryEqual(t2, h2(b, start, len), b, start, len);
+ }
+
+ private static boolean entryEqual(byte[][] t, int hash, byte[] b, int start, int len) {
+ return t[hash] != null && StringExpr.compare(t[hash], 0, t[hash].length, b, start, len) == 0;
+ }
+
+ public void insert(byte[] x) {
+ byte[] temp;
+ if (lookup(x, 0, x.length)) {
+ return;
+ }
+
+ // Try to insert up to n times. Rehash if that fails.
+ for(int i = 0; i != n; i++) {
+ int hash1 = h1(x, 0, x.length);
+ if (t1[hash1] == null) {
+ t1[hash1] = x;
+ return;
+ }
+
+ // swap x and t1[h1(x)]
+ temp = t1[hash1];
+ t1[hash1] = x;
+ x = temp;
+
+ int hash2 = h2(x, 0, x.length);
+ if (t2[hash2] == null) {
+ t2[hash2] = x;
+ return;
+ }
+
+ // swap x and t2[h2(x)]
+ temp = t2[hash2];
+ t2[hash2] = x;
+ x = temp;
+ }
+ rehash();
+ insert(x);
+ }
+
+ /**
+ * Insert all values in the input array into the set.
+ */
+ public void load(byte[][] a) {
+ for (byte[] x : a) {
+ insert(x);
+ }
+ }
+
+ /**
+ * Try to insert with up to n value's "poked out". Return the last value poked out.
+ * If the value is not blank then we assume there was a cycle.
+ * Don't try to insert the same value twice. This is for use in rehash only,
+ * so you won't see the same value twice.
+ */
+ private byte[] tryInsert(byte[] x) {
+ byte[] temp;
+
+ for(int i = 0; i != n; i++) {
+ int hash1 = h1(x, 0, x.length);
+ if (t1[hash1] == null) {
+ t1[hash1] = x;
+ return null;
+ }
+
+ // swap x and t1[h1(x)]
+ temp = t1[hash1];
+ t1[hash1] = x;
+ x = temp;
+
+ int hash2 = h2(x, 0, x.length);
+ if (t2[hash2] == null) {
+ t2[hash2] = x;
+ return null;
+ }
+
+ // swap x and t2[h2(x)]
+ temp = t2[hash2];
+ t2[hash2] = x;
+ x = temp;
+ if (x == null) {
+ break;
+ }
+ }
+ return x;
+ }
+
+ /**
+ * first hash function
+ */
+ private int h1(byte[] b, int start, int len) {
+
+ // AND hash with mask to 0 out sign bit to make sure it's positive.
+ // Then we know taking the result mod n is in the range (0..n-1).
+ return (hash(b, start, len, 0) & 0x7FFFFFFF) % n;
+ }
+
+ /**
+ * second hash function
+ */
+ private int h2(byte[] b, int start, int len) {
+
+ // AND hash with mask to 0 out sign bit to make sure it's positive.
+ // Then we know taking the result mod n is in the range (0..n-1).
+ // Include salt as argument so this hash function can be varied
+ // if we need to rehash.
+ return (hash(b, start, len, salt) & 0x7FFFFFFF) % n;
+ }
+
+ /**
+ * In case of rehash, hash function h2 is changed by updating the
+ * salt value passed in to the function hash().
+ */
+ private void updateHashSalt() {
+ salt = gen.nextInt(0x7FFFFFFF);
+ }
+
+ private void rehash() {
+ rehashCount++;
+ if (rehashCount > 20) {
+ throw new RuntimeException("Too many rehashes");
+ }
+ updateHashSalt();
+
+ // Save original values
+ if (prev1 == null) {
+ prev1 = t1;
+ prev2 = t2; // fix: was "prev1 = t2", which lost t1's entries and left prev2 null (NPE below)
+ }
+ t1 = new byte[n][];
+ t2 = new byte[n][];
+ for (byte[] v : prev1) {
+ if (v != null) {
+ byte[] x = tryInsert(v);
+ if (x != null) {
+ rehash();
+ return;
+ }
+ }
+ }
+ for (byte[] v : prev2) {
+ if (v != null) {
+ byte[] x = tryInsert(v);
+ if (x != null) {
+ rehash();
+ return;
+ }
+ }
+ }
+
+ // We succeeded in adding all the values, so
+ // clear the previous values recorded.
+ prev1 = null;
+ prev2 = null;
+ }
+
+ /**
+ * This is adapted from the org.apache.hadoop.util.hash.JenkinsHash package.
+ * The interface needed to be modified to suit the use here, by adding
+ * a start offset parameter to the hash function.
+ *
+ * In the future, folding this back into the original Hadoop package should
+ * be considered. This code could then import that package and use it.
+ * The original comments from the source are below.
+ *
+ * taken from hashlittle() -- hash a variable-length key into a 32-bit value
+ *
+ * @param key the key (the unaligned variable-length array of bytes)
+ * @param nbytes number of bytes to include in hash
+ * @param initval can be any integer value
+ * @return a 32-bit value. Every bit of the key affects every bit of the
+ * return value. Two keys differing by one or two bits will have totally
+ * different hash values.
+ *
+ * <p>The best hash table sizes are powers of 2. There is no need to do mod
+ * a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask.
+ * For example, if you need only 10 bits, do
+ * <code>h = (h & hashmask(10));</code>
+ * In which case, the hash table should have hashsize(10) elements.
+ *
+ * <p>If you are hashing n strings byte[][] k, do it like this:
+ * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h);
+ *
+ * <p>By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
+ * code any way you wish, private, educational, or commercial. It's free.
+ *
+ * <p>Use for hash table lookup, or anything where one collision in 2^^32 is
+ * acceptable. Do NOT use for cryptographic purposes.
+ */
+ @SuppressWarnings("fallthrough")
+ private int hash(byte[] key, int start, int nbytes, int initval) {
+ int length = nbytes;
+ long a, b, c; // We use longs because we don't have unsigned ints
+ a = b = c = (0x00000000deadbeefL + length + initval) & INT_MASK;
+ int offset = start;
+ for (; length > 12; offset += 12, length -= 12) {
+ a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+
+ /*
+ * mix -- mix 3 32-bit values reversibly.
+ * This is reversible, so any information in (a,b,c) before mix() is
+ * still in (a,b,c) after mix().
+ *
+ * If four pairs of (a,b,c) inputs are run through mix(), or through
+ * mix() in reverse, there are at least 32 bits of the output that
+ * are sometimes the same for one pair and different for another pair.
+ *
+ * This was tested for:
+ * - pairs that differed by one bit, by two bits, in any combination
+ * of top bits of (a,b,c), or in any combination of bottom bits of
+ * (a,b,c).
+ * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ * is commonly produced by subtraction) look like a single 1-bit
+ * difference.
+ * - the base values were pseudorandom, all zero but one bit set, or
+ * all zero plus a counter that starts at zero.
+ *
+ * Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+ * satisfy this are
+ * 4 6 8 16 19 4
+ * 9 15 3 18 27 15
+ * 14 9 3 7 17 3
+ * Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for
+ * "differ" defined as + with a one-bit base and a two-bit delta. I
+ * used http://burtleburtle.net/bob/hash/avalanche.html to choose
+ * the operations, constants, and arrangements of the variables.
+ *
+ * This does not achieve avalanche. There are input bits of (a,b,c)
+ * that fail to affect some output bits of (a,b,c), especially of a.
+ * The most thoroughly mixed value is c, but it doesn't really even
+ * achieve avalanche in c.
+ *
+ * This allows some parallelism. Read-after-writes are good at doubling
+ * the number of bits affected, so the goal of mixing pulls in the
+ * opposite direction as the goal of parallelism. I did what I could.
+ * Rotates seem to cost as much as shifts on every machine I could lay
+ * my hands on, and rotates are much kinder to the top and bottom bits,
+ * so I used rotates.
+ *
+ * #define mix(a,b,c) \
+ * { \
+ * a -= c; a ^= rot(c, 4); c += b; \
+ * b -= a; b ^= rot(a, 6); a += c; \
+ * c -= b; c ^= rot(b, 8); b += a; \
+ * a -= c; a ^= rot(c,16); c += b; \
+ * b -= a; b ^= rot(a,19); a += c; \
+ * c -= b; c ^= rot(b, 4); b += a; \
+ * }
+ *
+ * mix(a,b,c);
+ */
+ a = (a - c) & INT_MASK; a ^= rot(c, 4); c = (c + b) & INT_MASK;
+ b = (b - a) & INT_MASK; b ^= rot(a, 6); a = (a + c) & INT_MASK;
+ c = (c - b) & INT_MASK; c ^= rot(b, 8); b = (b + a) & INT_MASK;
+ a = (a - c) & INT_MASK; a ^= rot(c,16); c = (c + b) & INT_MASK;
+ b = (b - a) & INT_MASK; b ^= rot(a,19); a = (a + c) & INT_MASK;
+ c = (c - b) & INT_MASK; c ^= rot(b, 4); b = (b + a) & INT_MASK;
+ }
+
+ //-------------------------------- last block: affect all 32 bits of (c)
+ switch (length) { // all the case statements fall through
+ case 12:
+ c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 11:
+ c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 10:
+ c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 9:
+ c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK;
+ case 8:
+ b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 7:
+ b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 6:
+ b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 5:
+ b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK;
+ case 4:
+ a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 3:
+ a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 2:
+ a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 1:
+ a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK;
+ break;
+ case 0:
+ return (int)(c & INT_MASK);
+ }
+ /*
+ * final -- final mixing of 3 32-bit values (a,b,c) into c
+ *
+ * Pairs of (a,b,c) values differing in only a few bits will usually
+ * produce values of c that look totally different. This was tested for
+ * - pairs that differed by one bit, by two bits, in any combination
+ * of top bits of (a,b,c), or in any combination of bottom bits of
+ * (a,b,c).
+ *
+ * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ * is commonly produced by subtraction) look like a single 1-bit
+ * difference.
+ *
+ * - the base values were pseudorandom, all zero but one bit set, or
+ * all zero plus a counter that starts at zero.
+ *
+ * These constants passed:
+ * 14 11 25 16 4 14 24
+ * 12 14 25 16 4 14 24
+ * and these came close:
+ * 4 8 15 26 3 22 24
+ * 10 8 15 26 3 22 24
+ * 11 8 15 26 3 22 24
+ *
+ * #define final(a,b,c) \
+ * {
+ * c ^= b; c -= rot(b,14); \
+ * a ^= c; a -= rot(c,11); \
+ * b ^= a; b -= rot(a,25); \
+ * c ^= b; c -= rot(b,16); \
+ * a ^= c; a -= rot(c,4); \
+ * b ^= a; b -= rot(a,14); \
+ * c ^= b; c -= rot(b,24); \
+ * }
+ *
+ */
+ c ^= b; c = (c - rot(b,14)) & INT_MASK;
+ a ^= c; a = (a - rot(c,11)) & INT_MASK;
+ b ^= a; b = (b - rot(a,25)) & INT_MASK;
+ c ^= b; c = (c - rot(b,16)) & INT_MASK;
+ a ^= c; a = (a - rot(c,4)) & INT_MASK;
+ b ^= a; b = (b - rot(a,14)) & INT_MASK;
+ c ^= b; c = (c - rot(b,24)) & INT_MASK;
+
+ return (int)(c & INT_MASK);
+ }
+
+ private static long rot(long val, int pos) {
+ return ((Integer.rotateLeft(
+ (int)(val & INT_MASK), pos)) & INT_MASK);
+ }
+}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetDouble.java Wed Nov 6 16:43:36 2013
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * A high-performance set implementation used to support fast set membership testing,
+ * using Cuckoo hashing. This is used to support fast tests of the form
+ *
+ * column IN ( <list-of-values> )
+ *
+ * For double, we simply layer over the implementation for long. Double.doubleToRawLongBits
+ * is used to convert a 64-bit double to a 64-bit long with bit-for-bit fidelity.
+ */
+public class CuckooSetDouble {
+ CuckooSetLong setLong;
+
+ public CuckooSetDouble(int expectedSize) {
+ setLong = new CuckooSetLong(expectedSize);
+ }
+
+ /**
+ * Return true if and only if the value x is present in the set.
+ */
+ public boolean lookup(double x) {
+ return setLong.lookup(Double.doubleToRawLongBits(x));
+ }
+
+ /**
+ * Insert a single value into the set.
+ */
+ public void insert(double x) {
+ setLong.insert(Double.doubleToRawLongBits(x));
+ }
+
+ /**
+ * Insert all values in the input array into the set.
+ */
+ public void load(double[] a) {
+ for (double x : a) { // primitive loop variable: avoid pointless Double autoboxing per element
+ insert(x);
+ }
+ }
+}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/CuckooSetLong.java Wed Nov 6 16:43:36 2013
@@ -0,0 +1,277 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Arrays;
+import java.util.Random;
+
/**
 * A high-performance set implementation used to support fast set membership testing,
 * using Cuckoo hashing. This is used to support fast tests of the form
 *
 *       column IN ( &lt;list-of-values&gt; )
 *
 * For details on the algorithm, see R. Pagh and F. F. Rodler, "Cuckoo Hashing,"
 * Elsevier Science preprint, Dec. 2003. http://www.itu.dk/people/pagh/papers/cuckoo-jour.pdf.
 *
 * Each value is stored in one of two tables, at a slot chosen by one of two hash
 * functions, so a lookup costs at most two probes.
 */
public class CuckooSetLong {
  private long t1[];           // first hash table, indexed by h1
  private long t2[];           // second hash table, indexed by h2
  private long prev1[] = null; // holds the previous t1 during a rehash
  private long prev2[] = null; // holds the previous t2 during a rehash
  private int n;               // current size of each of the two tables
  private static final double PADDING_FACTOR = 1.0/0.40; // have minimum 40% fill factor
  private int salt[] = new int[6];
  private Random gen = new Random(676983475); // fixed seed => deterministic behavior
  private long blank = Long.MIN_VALUE; // empty-slot sentinel; replaced if it is ever inserted
  private int rehashCount = 0;

  // some prime numbers spaced about at powers of 2 in magnitude
  public static int primes[] = {7, 13, 17, 23, 31, 53, 67, 89, 127, 269, 571, 1019, 2089,
      4507, 8263, 16361, 32327, 65437, 131111, 258887, 525961, 999983, 2158909, 4074073,
      8321801, 15485863, 32452867, 67867967, 122949829, 256203221, 553105253, 982451653,
      1645333507, 2147483647};

  /**
   * Allocate a new set to hold expectedSize values. Re-allocation to expand
   * the set is not implemented, so the expected size must be at least the
   * size of the set to be inserted.
   * @param expectedSize At least the size of the set of values that will be inserted.
   */
  public CuckooSetLong(int expectedSize) {

    // Choose array size. We have two hash tables to hold entries, so the sum
    // of the two should have a bit more than twice as much space as the
    // minimum required.
    n = (int) (expectedSize * PADDING_FACTOR / 2.0);

    // try to get prime number table size to have less dependence on good hash function
    for (int i = 0; i != primes.length; i++) {
      if (n <= primes[i]) {
        n = primes[i];
        break;
      }
    }

    t1 = new long[n];
    t2 = new long[n];
    Arrays.fill(t1, blank);
    Arrays.fill(t2, blank);
    updateHashSalt();
  }

  /**
   * Return true if and only if the value x is present in the set.
   */
  public boolean lookup(long x) {

    /* Must check that x is not blank because otherwise you could
     * get a false positive if the blank value was a value you
     * were legitimately testing to see if it was in the set.
     */
    return x != blank && (t1[h1(x)] == x || t2[h2(x)] == x);
  }

  /**
   * Insert a single value into the set. Inserting a duplicate is a no-op.
   */
  public void insert(long x) {
    if (x == blank) {
      // x collides with the empty-slot sentinel; choose a new sentinel first.
      findNewBlank();
    }
    long temp;
    if (lookup(x)) {
      return;
    }

    // Try to insert up to n times, displacing ("cuckooing") occupants between
    // the two tables. Rehash with a new salt if that fails.
    for(int i = 0; i != n; i++) {
      if (t1[h1(x)] == blank) {
        t1[h1(x)] = x;
        return;
      }

      // swap x and t1[h1(x)]
      temp = t1[h1(x)];
      t1[h1(x)] = x;
      x = temp;

      if (t2[h2(x)] == blank) {
        t2[h2(x)] = x;
        return;
      }

      // swap x and t2[h2(x)]
      temp = t2[h2(x)];
      t2[h2(x)] = x;
      x = temp;
    }
    rehash();
    insert(x);
  }

  /**
   * Insert all values in the input array into the set.
   */
  public void load(long[] a) {
    // Iterate with the primitive type; the original "for (Long x : a)"
    // autoboxed every element to java.lang.Long for no benefit.
    for (long x : a) {
      insert(x);
    }
  }

  /**
   * Need to change current blank value to something else because it is in
   * the input data set.
   */
  private void findNewBlank() {
    long newBlank = gen.nextLong();
    while(newBlank == blank || lookup(newBlank)) {
      newBlank = gen.nextLong();
    }

    // replace existing blanks with new blanks
    for(int i = 0; i != n; i++) {
      if (t1[i] == blank) {
        t1[i] = newBlank;
      }
      if (t2[i] == blank) {
        t2[i] = newBlank;
      }
    }
    blank = newBlank;
  }

  /**
   * Try to insert with up to n values "poked out". Return the last value poked out.
   * If the returned value is not blank then we assume there was a cycle.
   * Don't try to insert the same value twice. This is for use in rehash only,
   * so you won't see the same value twice.
   */
  private long tryInsert(long x) {
    long temp;

    for(int i = 0; i != n; i++) {
      if (t1[h1(x)] == blank) {
        t1[h1(x)] = x;
        return blank;
      }

      // swap x and t1[h1(x)]
      temp = t1[h1(x)];
      t1[h1(x)] = x;
      x = temp;

      if (t2[h2(x)] == blank) {
        t2[h2(x)] = x;
        return blank;
      }

      // swap x and t2[h2(x)]
      temp = t2[h2(x)];
      t2[h2(x)] = x;
      x = temp;
      if (x == blank) {
        break;
      }
    }
    return x;
  }

  /**
   * Variation of Robert Jenkins' hash function.
   */
  private int h1(long y) {

    // Fold the high and low 32-bit halves together. The narrowing cast keeps
    // the low 32 bits, so the original "& 0xFFFFFFFF" mask (an int literal that
    // sign-extends to -1L and therefore masked nothing) has been dropped; the
    // resulting value is bit-for-bit identical.
    int x = (int) ((y >>> 32) ^ y);
    x = (x + salt[0]) + (x << 12);
    x = (x ^ salt[1]) ^ (x >> 19);
    x = (x + salt[2]) + (x << 5);
    x = (x + salt[3]) ^ (x << 9);
    x = (x + salt[4]) + (x << 3);
    x = (x ^ salt[5]) ^ (x >> 16);

    // Return value modulo n but always in the positive range (0..n-1).
    // And with the mask to zero the sign bit to make the input to mod positive
    // so the output will definitely be positive.
    return (x & 0x7FFFFFFF) % n;
  }

  /**
   * basic modular hash function
   */
  private int h2(long x) {

    // Return value modulo n but always in the positive range (0..n-1).
    // Since n is prime, this gives good spread for numbers that are multiples
    // of one billion, which is important since timestamps internally
    // are stored as a number of nanoseconds, and the fractional seconds
    // part is often 0.
    return (((int) (x % n)) + n) % n;
  }

  /**
   * In case of rehash, hash function h1 is changed by updating the
   * entries in the salt array with new random values. (Note: only h1
   * consumes the salt; h2 is a fixed modular hash.)
   */
  private void updateHashSalt() {
    for (int i = 0; i != 6; i++) {
      salt[i] = gen.nextInt(0x7FFFFFFF);
    }
  }

  /**
   * Re-randomize the hash salt and re-insert every stored value. The tables
   * are not grown; a rehash only changes h1 to break insertion cycles.
   */
  private void rehash() {
    rehashCount++;
    if (rehashCount > 20) {
      throw new RuntimeException("Too many rehashes");
    }
    updateHashSalt();

    // Save original values so they can be re-inserted under the new salt.
    // BUG FIX: this used to read "prev1 = t1; prev1 = t2;", which discarded
    // the contents of t1 and left prev2 null, so the first rehash silently
    // lost values and then threw NullPointerException in the prev2 loop below.
    if (prev1 == null) {
      prev1 = t1;
      prev2 = t2;
    }
    t1 = new long[n];
    t2 = new long[n];
    Arrays.fill(t1, blank);
    Arrays.fill(t2, blank);
    for (long v : prev1) {
      if (v != blank) {
        long x = tryInsert(v);
        if (x != blank) {
          // A cycle was detected; try again with yet another salt. The value
          // poked out by tryInsert is still present in prev1/prev2, so it is
          // not lost.
          rehash();
          return;
        }
      }
    }
    for (long v : prev2) {
      if (v != blank) {
        long x = tryInsert(v);
        if (x != blank) {
          rehash();
          return;
        }
      }
    }

    // We succeeded in adding all the values, so
    // clear the previous values recorded.
    prev1 = null;
    prev2 = null;
  }
}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterDoubleColumnInList.java Wed Nov 6 16:43:36 2013
@@ -0,0 +1,180 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
/**
 * Evaluate IN filter on a batch for a vector of doubles.
 * Rows whose column value is NULL or not in the IN list are removed
 * from the batch's selection.
 */
public class FilterDoubleColumnInList extends VectorExpression {
  private static final long serialVersionUID = 1L;

  // Index of the input column within the VectorizedRowBatch.
  private int inputCol;

  // The literal IN-list values, set via setInListValues().
  private double[] inListValues;

  // The set object containing the IN list. This is optimized for lookup
  // of the data type of the column. It is transient (not serialized with
  // the expression) and is rebuilt lazily on the first evaluate() call.
  private transient CuckooSetDouble inSet;

  public FilterDoubleColumnInList() {
    super();
    inSet = null;
  }

  /**
   * After construction you must call setInListValues() to add the values to the IN set.
   *
   * @param colNum index of the input column holding the values to test
   */
  public FilterDoubleColumnInList(int colNum) {
    this.inputCol = colNum;
    inSet = null;
  }

  /**
   * Filter the batch in place: after this call, batch.size, batch.selected and
   * batch.selectedInUse reflect only the rows whose value is in the IN list.
   */
  @Override
  public void evaluate(VectorizedRowBatch batch) {

    if (childExpressions != null) {
      super.evaluateChildren(batch);
    }

    // Lazily build the hash set on first use (inSet is transient).
    if (inSet == null) {
      inSet = new CuckooSetDouble(inListValues.length);
      inSet.load(inListValues);
    }

    DoubleColumnVector inputColVector = (DoubleColumnVector) batch.cols[inputCol];
    int[] sel = batch.selected;                 // row indices in use when batch.selectedInUse
    boolean[] nullPos = inputColVector.isNull;  // per-row null flags
    int n = batch.size;
    double[] vector = inputColVector.vector;

    // return immediately if batch is empty
    if (n == 0) {
      return;
    }

    if (inputColVector.noNulls) {
      if (inputColVector.isRepeating) {

        // All must be selected otherwise size would be zero
        // Repeating property will not change.

        if (!(inSet.lookup(vector[0]))) {
          //Entire batch is filtered out.
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        // Compact the existing selection list in place, keeping matching rows.
        int newSize = 0;
        for(int j=0; j != n; j++) {
          int i = sel[j];
          if (inSet.lookup(vector[i])) {
            sel[newSize++] = i;
          }
        }
        batch.size = newSize;
      } else {
        // No selection list yet: build one from the qualifying row indices.
        int newSize = 0;
        for(int i = 0; i != n; i++) {
          if (inSet.lookup(vector[i])) {
            sel[newSize++] = i;
          }
        }
        if (newSize < n) {
          // Only switch to a selection list if some rows were filtered out.
          batch.size = newSize;
          batch.selectedInUse = true;
        }
      }
    } else {
      if (inputColVector.isRepeating) {
        //All must be selected otherwise size would be zero
        //Repeating property will not change.
        if (!nullPos[0]) {
          if (!inSet.lookup(vector[0])) {
            //Entire batch is filtered out.
            batch.size = 0;
          }
        } else {
          // Repeating NULL value: no row can pass the filter.
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        int newSize = 0;
        for(int j = 0; j != n; j++) {
          int i = sel[j];
          if (!nullPos[i]) { // NULL rows are always filtered out
            if (inSet.lookup(vector[i])) {
              sel[newSize++] = i;
            }
          }
        }

        // Change the selected vector
        batch.size = newSize;
      } else {
        int newSize = 0;
        for(int i = 0; i != n; i++) {
          if (!nullPos[i]) { // NULL rows are always filtered out
            if (inSet.lookup(vector[i])) {
              sel[newSize++] = i;
            }
          }
        }
        if (newSize < n) {
          batch.size = newSize;
          batch.selectedInUse = true;
        }
      }
    }
  }


  @Override
  public String getOutputType() {
    return "boolean";
  }

  // Filter expressions produce no output column, hence -1.
  @Override
  public int getOutputColumn() {
    return -1;
  }

  @Override
  public Descriptor getDescriptor() {

    // This VectorExpression (IN) is a special case, so don't return a descriptor.
    return null;
  }

  public double[] getInListValues() {
    return this.inListValues;
  }

  /**
   * Set the literal values of the IN list. Must be called before evaluate().
   */
  public void setInListValues(double [] a) {
    this.inListValues = a;
  }
}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterLongColumnInList.java Wed Nov 6 16:43:36 2013
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
/**
 * Evaluate IN filter on a batch for a vector of longs.
 * Rows whose column value is NULL or not in the IN list are removed
 * from the batch's selection.
 */
public class FilterLongColumnInList extends VectorExpression {
  private static final long serialVersionUID = 1L;

  // Index of the input column within the VectorizedRowBatch.
  private int inputCol;

  // The literal IN-list values, set via setInListValues().
  private long[] inListValues;

  // The set object containing the IN list. This is optimized for lookup
  // of the data type of the column. It is transient (not serialized with
  // the expression) and is rebuilt lazily on the first evaluate() call.
  private transient CuckooSetLong inSet;

  public FilterLongColumnInList() {
    super();
    inSet = null;
  }

  /**
   * After construction you must call setInListValues() to add the values to the IN set.
   *
   * @param colNum index of the input column holding the values to test
   */
  public FilterLongColumnInList(int colNum) {
    this.inputCol = colNum;
    inSet = null;
  }

  /**
   * Filter the batch in place: after this call, batch.size, batch.selected and
   * batch.selectedInUse reflect only the rows whose value is in the IN list.
   */
  @Override
  public void evaluate(VectorizedRowBatch batch) {

    if (childExpressions != null) {
      super.evaluateChildren(batch);
    }

    // Lazily build the hash set on first use (inSet is transient).
    if (inSet == null) {
      inSet = new CuckooSetLong(inListValues.length);
      inSet.load(inListValues);
    }

    LongColumnVector inputColVector = (LongColumnVector) batch.cols[inputCol];
    int[] sel = batch.selected;                 // row indices in use when batch.selectedInUse
    boolean[] nullPos = inputColVector.isNull;  // per-row null flags
    int n = batch.size;
    long[] vector = inputColVector.vector;

    // return immediately if batch is empty
    if (n == 0) {
      return;
    }

    if (inputColVector.noNulls) {
      if (inputColVector.isRepeating) {

        // All must be selected otherwise size would be zero
        // Repeating property will not change.

        if (!(inSet.lookup(vector[0]))) {
          //Entire batch is filtered out.
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        // Compact the existing selection list in place, keeping matching rows.
        int newSize = 0;
        for(int j=0; j != n; j++) {
          int i = sel[j];
          if (inSet.lookup(vector[i])) {
            sel[newSize++] = i;
          }
        }
        batch.size = newSize;
      } else {
        // No selection list yet: build one from the qualifying row indices.
        int newSize = 0;
        for(int i = 0; i != n; i++) {
          if (inSet.lookup(vector[i])) {
            sel[newSize++] = i;
          }
        }
        if (newSize < n) {
          // Only switch to a selection list if some rows were filtered out.
          batch.size = newSize;
          batch.selectedInUse = true;
        }
      }
    } else {
      if (inputColVector.isRepeating) {
        //All must be selected otherwise size would be zero
        //Repeating property will not change.
        if (!nullPos[0]) {
          if (!inSet.lookup(vector[0])) {
            //Entire batch is filtered out.
            batch.size = 0;
          }
        } else {
          // Repeating NULL value: no row can pass the filter.
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        int newSize = 0;
        for(int j = 0; j != n; j++) {
          int i = sel[j];
          if (!nullPos[i]) { // NULL rows are always filtered out
            if (inSet.lookup(vector[i])) {
              sel[newSize++] = i;
            }
          }
        }

        // Change the selected vector
        batch.size = newSize;
      } else {
        int newSize = 0;
        for(int i = 0; i != n; i++) {
          if (!nullPos[i]) { // NULL rows are always filtered out
            if (inSet.lookup(vector[i])) {
              sel[newSize++] = i;
            }
          }
        }
        if (newSize < n) {
          batch.size = newSize;
          batch.selectedInUse = true;
        }
      }
    }
  }


  @Override
  public String getOutputType() {
    return "boolean";
  }

  // Filter expressions produce no output column, hence -1.
  @Override
  public int getOutputColumn() {
    return -1;
  }

  @Override
  public Descriptor getDescriptor() {

    // This VectorExpression (IN) is a special case, so don't return a descriptor.
    return null;
  }

  public long[] getInListValues() {
    return this.inListValues;
  }

  /**
   * Set the literal values of the IN list. Must be called before evaluate().
   */
  public void setInListValues(long [] a) {
    this.inListValues = a;
  }
}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColumnInList.java Wed Nov 6 16:43:36 2013
@@ -0,0 +1,187 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
/**
 * Evaluate an IN filter on a batch for a vector of strings.
 * This is optimized so that no objects have to be created in
 * the inner loop, and there is a hash table implemented
 * with Cuckoo hashing that has fast lookup to do the IN test.
 * Rows whose column value is NULL or not in the IN list are removed
 * from the batch's selection.
 */
public class FilterStringColumnInList extends VectorExpression {
  private static final long serialVersionUID = 1L;

  // Index of the input column within the VectorizedRowBatch.
  private int inputCol;

  // The literal IN-list values as byte arrays, set via setInListValues().
  private byte[][] inListValues;

  // The set object containing the IN list. This is optimized for lookup
  // of the data type of the column. It is transient (not serialized with
  // the expression) and is rebuilt lazily on the first evaluate() call.
  private transient CuckooSetBytes inSet;

  public FilterStringColumnInList() {
    super();
    inSet = null;
  }

  /**
   * After construction you must call setInListValues() to add the values to the IN set.
   *
   * @param colNum index of the input column holding the values to test
   */
  public FilterStringColumnInList(int colNum) {
    this.inputCol = colNum;
    inSet = null;
  }

  /**
   * Filter the batch in place: after this call, batch.size, batch.selected and
   * batch.selectedInUse reflect only the rows whose value is in the IN list.
   * String values are compared as (buffer, start, length) byte ranges, so no
   * objects are allocated per row.
   */
  @Override
  public void evaluate(VectorizedRowBatch batch) {

    if (childExpressions != null) {
      super.evaluateChildren(batch);
    }

    // Lazily build the hash set on first use (inSet is transient).
    if (inSet == null) {
      inSet = new CuckooSetBytes(inListValues.length);
      inSet.load(inListValues);
    }

    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[inputCol];
    int[] sel = batch.selected;                 // row indices in use when batch.selectedInUse
    boolean[] nullPos = inputColVector.isNull;  // per-row null flags
    int n = batch.size;
    byte[][] vector = inputColVector.vector;    // per-row byte buffers
    int[] start = inputColVector.start;         // per-row start offset within the buffer
    int[] len = inputColVector.length;          // per-row value length

    // return immediately if batch is empty
    if (n == 0) {
      return;
    }

    if (inputColVector.noNulls) {
      if (inputColVector.isRepeating) {

        // All must be selected otherwise size would be zero
        // Repeating property will not change.
        if (!(inSet.lookup(vector[0], start[0], len[0]))) {

          // Entire batch is filtered out.
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        // Compact the existing selection list in place, keeping matching rows.
        int newSize = 0;
        for(int j = 0; j != n; j++) {
          int i = sel[j];
          if (inSet.lookup(vector[i], start[i], len[i])) {
            sel[newSize++] = i;
          }
        }
        batch.size = newSize;
      } else {
        // No selection list yet: build one from the qualifying row indices.
        int newSize = 0;
        for(int i = 0; i != n; i++) {
          if (inSet.lookup(vector[i], start[i], len[i])) {
            sel[newSize++] = i;
          }
        }
        if (newSize < n) {
          // Only switch to a selection list if some rows were filtered out.
          batch.size = newSize;
          batch.selectedInUse = true;
        }
      }
    } else {
      if (inputColVector.isRepeating) {

        // All must be selected otherwise size would be zero
        // Repeating property will not change.
        if (!nullPos[0]) {
          if (!inSet.lookup(vector[0], start[0], len[0])) {

            // Entire batch is filtered out.
            batch.size = 0;
          }
        } else {
          // Repeating NULL value: no row can pass the filter.
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        int newSize = 0;
        for(int j = 0; j != n; j++) {
          int i = sel[j];
          if (!nullPos[i]) { // NULL rows are always filtered out
            if (inSet.lookup(vector[i], start[i], len[i] )) {
              sel[newSize++] = i;
            }
          }
        }

        // Change the selected vector
        batch.size = newSize;
      } else {
        int newSize = 0;
        for(int i = 0; i != n; i++) {
          if (!nullPos[i]) { // NULL rows are always filtered out
            if (inSet.lookup(vector[i], start[i], len[i])) {
              sel[newSize++] = i;
            }
          }
        }
        if (newSize < n) {
          batch.size = newSize;
          batch.selectedInUse = true;
        }
      }
    }
  }


  @Override
  public String getOutputType() {
    return "boolean";
  }

  // Filter expressions produce no output column, hence -1.
  @Override
  public int getOutputColumn() {
    return -1;
  }

  @Override
  public Descriptor getDescriptor() {

    // This VectorExpression (IN) is a special case, so don't return a descriptor.
    return null;
  }

  public byte[][] getInListValues() {
    return this.inListValues;
  }

  /**
   * Set the literal values of the IN list as byte arrays (e.g. UTF-8 encoded
   * strings). Must be called before evaluate().
   */
  public void setInListValues(byte [][] a) {
    this.inListValues = a;
  }
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java Wed Nov 6 16:43:36 2013
@@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.udf.gen
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
@@ -172,6 +173,7 @@ public class Vectorizer implements Physi
supportedGenericUDFs.add(GenericUDFConcat.class);
supportedGenericUDFs.add(GenericUDFAbs.class);
supportedGenericUDFs.add(GenericUDFBetween.class);
+ supportedGenericUDFs.add(GenericUDFIn.class);
// For type casts
supportedGenericUDFs.add(UDFToLong.class);
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizationContext.java Wed Nov 6 16:43:36 2013
@@ -92,6 +92,7 @@ import org.apache.hadoop.hive.ql.udf.UDF
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLower;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
@@ -939,4 +940,42 @@ public class TestVectorizationContext {
ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
assertTrue(ve instanceof FilterDoubleColumnNotBetween);
}
+
  /**
   * Verify that an IN expression over each supported column type (string,
   * long, double) vectorizes to the corresponding specialized filter class.
   */
  @Test
  public void testInFilters() throws HiveException {
    // A string column reference plus two string constants for the IN list.
    ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(String.class, "col1", "table", false);
    ExprNodeConstantDesc constDesc = new ExprNodeConstantDesc("Alpha");
    ExprNodeConstantDesc constDesc2 = new ExprNodeConstantDesc("Bravo");

    // string IN
    GenericUDFIn udf = new GenericUDFIn();
    ExprNodeGenericFuncDesc exprDesc = new ExprNodeGenericFuncDesc();
    exprDesc.setGenericUDF(udf);
    List<ExprNodeDesc> children1 = new ArrayList<ExprNodeDesc>();
    children1.add(col1Expr);
    children1.add(constDesc);
    children1.add(constDesc2);
    exprDesc.setChildren(children1);

    Map<String, Integer> columnMap = new HashMap<String, Integer>();
    columnMap.put("col1", 1);
    columnMap.put("col2", 2);
    VectorizationContext vc = new VectorizationContext(columnMap, 2);
    VectorExpression ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
    assertTrue(ve instanceof FilterStringColumnInList);

    // long IN: reuse the same expression tree with long-typed children.
    children1.set(0, new ExprNodeColumnDesc(Long.class, "col1", "table", false));
    children1.set(1, new ExprNodeConstantDesc(10));
    children1.set(2, new ExprNodeConstantDesc(20));
    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
    assertTrue(ve instanceof FilterLongColumnInList);

    // double IN: reuse the same expression tree with double-typed children.
    children1.set(0, new ExprNodeColumnDesc(Double.class, "col1", "table", false));
    children1.set(1, new ExprNodeConstantDesc(10d));
    children1.set(2, new ExprNodeConstantDesc(20d));
    ve = vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.FILTER);
    assertTrue(ve instanceof FilterDoubleColumnInList);
  }
}
Added: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java?rev=1539392&view=auto
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java (added)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestCuckooSet.java Wed Nov 6 16:43:36 2013
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import static org.junit.Assert.*;
+
+import java.util.Random;
+
+import org.junit.Test;
+
/**
 * Unit tests for the Cuckoo-hash set implementations (CuckooSetLong,
 * CuckooSetDouble, CuckooSetBytes) used to evaluate vectorized IN filters.
 * Note: the seeded Random instances make the "random" tests deterministic.
 */
public class TestCuckooSet {

  // maximum table size
  private static int MAX_SIZE = 65437;

  /**
   * Fixed-value tests for CuckooSetLong, exercising the blank-sentinel
   * replacement logic in both directions (sentinel in the set, and
   * sentinel as a negative lookup).
   */
  @Test
  public void testSetLong () {

    // Set of values to look for. Include the original blank value Long.MIN_VALUE to make sure
    // the process of choosing a new blank works.
    Long[] values = {1L, 2L, 3L, 1000L, 2000L, 3000L, 8L, 8L, 9L, 13L, 17L,
        22L, 23L, 24L, 25L, -26L, 27L,
        28L, 29L, 30L, 111111111111111L, -444444444444444L, Long.MIN_VALUE};
    Long[] negatives = {0L, 4L, 4000L, -2L, 19L, 222222222222222L, -333333333333333L};
    CuckooSetLong s = new CuckooSetLong(values.length);
    for(Long v : values) {
      s.insert(v);
    }

    // test that the values we added are there
    for(Long v : values) {
      assertTrue(s.lookup(v));
    }

    // test that values that we know are missing are shown to be absent
    for (Long v : negatives) {
      assertFalse(s.lookup(v));
    }

    // Set of values to look for.
    Long[] values2 = {1L, 2L, 3L, 1000L, 2000L, 3000L, 8L, 8L, 9L, 13L, 17L, 22L, 23L,
        24L, 25L, -26L, 27L,
        28L, 29L, 30L, 111111111111111L, -444444444444444L};

    // Include the original blank value Long.MIN_VALUE in the negatives to make sure we get
    // the correct result that the blank value is not there.
    Long[] negatives2 = {0L, 4L, 4000L, -2L, 19L, 222222222222222L,
        -333333333333333L, Long.MIN_VALUE};
    s = new CuckooSetLong(values2.length);
    for(Long v : values2) {
      s.insert(v);
    }

    // test that the values we added are there
    for(Long v : values2) {
      assertTrue(s.lookup(v));
    }

    // test that values that we know are missing are shown to be absent
    for (Long v : negatives2) {
      assertFalse(s.lookup(v));
    }

  }

  // load multiple random sets of Long values
  @Test
  public void testSetLongRandom() {
    long[] values;
    Random gen = new Random(98763537); // fixed seed keeps the test deterministic
    for(int i = 0; i < 200;) {

      // Make a random array of longs
      int size = gen.nextInt() % MAX_SIZE;
      if (size <= 0) { // ensure size is >= 1, otherwise try again
        continue;
      }
      i++;
      values = new long[size];
      loadRandom(values, gen);

      // load them into a SetLong
      CuckooSetLong s = new CuckooSetLong(size);
      loadSet(s, values);

      // look them up to make sure they are all there
      for (int j = 0; j != size; j++) {
        assertTrue(s.lookup(values[j]));
      }
    }
  }

  /**
   * Fixed-value tests for CuckooSetDouble (which layers over CuckooSetLong
   * via Double.doubleToRawLongBits).
   */
  @Test
  public void testSetDouble() {

    // Set of values to look for.
    Double[] values = {7021.0D, 5780.0D, 0D, -1D, 1.999e50D};
    Double[] negatives = {7000.0D, -2D, 1.9999e50D};
    CuckooSetDouble s = new CuckooSetDouble(values.length);
    for(Double v : values) {
      s.insert(v);
    }

    // test that the values we added are there
    for(Double v : values) {
      assertTrue(s.lookup(v));
    }

    // test that values that we know are missing are shown to be absent
    for (Double v : negatives) {
      assertFalse(s.lookup(v));
    }
  }

  /**
   * Fixed-value tests for CuckooSetBytes, including a lookup of a
   * subsequence taken from the middle of a larger buffer.
   */
  @Test
  public void testSetBytes() {
    String[] strings = {"foo", "bar", "baz", "a", "", "x1341", "Z"};
    String[] negativeStrings = {"not", "in", "the", "set", "foobar"};
    byte[][] values = getByteArrays(strings);
    byte[][] negatives = getByteArrays(negativeStrings);

    // load set
    CuckooSetBytes s = new CuckooSetBytes(strings.length);
    for(byte[] v : values) {
      s.insert(v);
    }

    // test that the values we added are there
    for(byte[] v : values) {
      assertTrue(s.lookup(v, 0, v.length));
    }

    // test that values that we know are missing are shown to be absent
    for (byte[] v : negatives) {
      assertFalse(s.lookup(v, 0, v.length));
    }

    // Test that we can search correctly using a buffer and pulling
    // a sequence of bytes out of the middle of it. In this case it
    // is the 3 letter sequence "foo".
    byte[] buf = getUTF8Bytes("thewordfooisinhere");
    assertTrue(s.lookup(buf, 7, 3));
  }

  // load multiple random sets of byte-array values
  @Test
  public void testSetBytesLargeRandom() {
    byte[][] values;
    Random gen = new Random(98763537); // fixed seed keeps the test deterministic
    for(int i = 0; i < 200;) {

      // Make a random array of byte arrays
      int size = gen.nextInt() % MAX_SIZE;
      if (size <= 0) { // ensure size is >= 1, otherwise try again
        continue;
      }
      i++;
      values = new byte[size][];
      loadRandomBytes(values, gen);

      // load them into a set
      CuckooSetBytes s = new CuckooSetBytes(size);
      loadSet(s, values);

      // look them up to make sure they are all there
      for (int j = 0; j != size; j++) {
        assertTrue(s.lookup(values[j], 0, values[j].length));
      }
    }
  }

  // Fill the array with the UTF-8 encodings of random integers.
  public void loadRandomBytes(byte[][] values, Random gen) {
    for (int i = 0; i != values.length; i++) {
      values[i] = getUTF8Bytes(Integer.toString(gen.nextInt()));
    }
  }

  // UTF-8 encode a string; UnsupportedEncodingException cannot occur for "UTF-8".
  private byte[] getUTF8Bytes(String s) {
    byte[] v = null;
    try {
      v = s.getBytes("UTF-8");
    } catch (Exception e) {
      ; // won't happen
    }
    return v;
  }

  // Get an array of UTF-8 byte arrays from an array of strings
  private byte[][] getByteArrays(String[] strings) {
    byte[][] values = new byte[strings.length][];
    for(int i = 0; i != strings.length; i++) {
      try {
        values[i] = strings[i].getBytes("UTF-8");
      } catch (Exception e) {
        ; // can't happen
      }
    }
    return values;
  }

  // Insert every value of the array into the long set.
  private void loadSet(CuckooSetLong s, long[] values) {
    for (Long v : values) {
      s.insert(v);
    }
  }

  // Insert every byte array of the array into the bytes set.
  private void loadSet(CuckooSetBytes s, byte[][] values) {
    for (byte[] v: values) {
      s.insert(v);
    }
  }

  // Fill the array with random long values from the given generator.
  private void loadRandom(long[] a, Random gen) {
    int size = a.length;
    for(int i = 0; i != size; i++) {
      a[i] = gen.nextLong();
    }
  }
}
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java?rev=1539392&r1=1539391&r2=1539392&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorFilterExpressions.java Wed Nov 6 16:43:36 2013
@@ -543,4 +543,189 @@ public class TestVectorFilterExpressions
assertTrue(vrb.selectedInUse);
assertEquals(0, vrb.selected[0]);
}
+
+ /**
+ * Test the IN filter VectorExpression classes.
+ */
+
+ @Test
+ public void testFilterLongIn() {
+ int seed = 17;
+ VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+ 5, 2, seed);
+ LongColumnVector lcv0 = (LongColumnVector) vrb.cols[0];
+ // Filter accepts only the values 5 and 20 in column 0.
+ long[] inList = {5, 20};
+ FilterLongColumnInList f = new FilterLongColumnInList(0);
+ f.setInListValues(inList);
+ VectorExpression expr1 = f;
+
+ // Basic case
+ lcv0.vector[0] = 5;
+ lcv0.vector[1] = 20;
+ lcv0.vector[2] = 17;
+ lcv0.vector[3] = 15;
+ lcv0.vector[4] = 10;
+
+ expr1.evaluate(vrb);
+
+ // Rows 0 (value 5) and 1 (value 20) match; the batch is compacted
+ // to those two rows via the selected[] index vector.
+ assertEquals(2, vrb.size);
+ assertTrue(vrb.selectedInUse);
+ assertEquals(0, vrb.selected[0]);
+ assertEquals(1, vrb.selected[1]);
+
+ // With nulls
+ VectorizedRowBatch vrb1 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+ 5, 2, seed);
+
+ lcv0 = (LongColumnVector) vrb1.cols[0];
+
+ lcv0.vector[0] = 5;
+ lcv0.vector[1] = 20;
+ lcv0.vector[2] = 17;
+ lcv0.vector[3] = 15;
+ lcv0.vector[4] = 10;
+
+ // Row 0 matches the IN list by value but is null, so it must be
+ // filtered out; only row 1 (value 20, non-null) survives.
+ lcv0.noNulls = false;
+ lcv0.isNull[0] = true;
+ lcv0.isNull[2] = true;
+
+ expr1.evaluate(vrb1);
+ assertEquals(1, vrb1.size);
+ assertTrue(vrb1.selectedInUse);
+ assertEquals(1, vrb1.selected[0]);
+
+ // With nulls and selected
+ // Only rows 1, 2 and 4 are in scope; of those, row 1 matches (20)
+ // and rows 2 (null) and 4 (value 10) do not.
+ VectorizedRowBatch vrb2 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+ 7, 2, seed);
+ vrb2.selectedInUse = true;
+ vrb2.selected[0] = 1;
+ vrb2.selected[1] = 2;
+ vrb2.selected[2] = 4;
+ vrb2.size = 3;
+
+ lcv0 = (LongColumnVector) vrb2.cols[0];
+
+ lcv0.vector[0] = 5;
+ lcv0.vector[1] = 20;
+ lcv0.vector[2] = 17;
+ lcv0.vector[3] = 15;
+ lcv0.vector[4] = 10;
+ lcv0.vector[5] = 19;
+ lcv0.vector[6] = 21;
+
+ lcv0.noNulls = false;
+ lcv0.isNull[0] = true;
+ lcv0.isNull[2] = true;
+ lcv0.isNull[5] = true;
+
+ expr1.evaluate(vrb2);
+ assertEquals(1, vrb2.size);
+ assertEquals(1, vrb2.selected[0]);
+
+ // Repeating non null
+ // With isRepeating, only vector[0] (value 5, a match) is consulted,
+ // so the whole batch passes through unchanged.
+ VectorizedRowBatch vrb3 = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+ 7, 2, seed);
+ lcv0 = (LongColumnVector) vrb3.cols[0];
+
+ lcv0.isRepeating = true;
+ lcv0.vector[0] = 5;
+ lcv0.vector[1] = 20;
+ lcv0.vector[2] = 17;
+ lcv0.vector[3] = 15;
+ lcv0.vector[4] = 10;
+
+ expr1.evaluate(vrb3);
+ assertEquals(7, vrb3.size);
+ assertFalse(vrb3.selectedInUse);
+ assertTrue(lcv0.isRepeating);
+
+ // Repeating null
+ // Reuses vrb3/lcv0 from the previous case: a repeating null entry
+ // means every logical row is null, so nothing qualifies.
+ lcv0.noNulls = false;
+ lcv0.vector[0] = 5;
+ lcv0.isNull[0] = true;
+
+ expr1.evaluate(vrb3);
+ assertEquals(0, vrb3.size);
+ }
+
+ @Test
+ public void testFilterDoubleIn() {
+ // Sanity check of the double IN-list filter. The null, selected and
+ // repeating cases are exercised by the long variant, which follows the
+ // same code pattern.
+ int seed = 17;
+ VectorizedRowBatch batch = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+ 5, 2, seed);
+ DoubleColumnVector doubleCol = new DoubleColumnVector();
+ batch.cols[0] = doubleCol;
+
+ // Rows 0 and 1 hold IN-list members; the remaining rows do not.
+ doubleCol.vector[0] = 5.0;
+ doubleCol.vector[1] = 20.2;
+ doubleCol.vector[2] = 17.0;
+ doubleCol.vector[3] = 15.0;
+ doubleCol.vector[4] = 10.0;
+
+ FilterDoubleColumnInList filter = new FilterDoubleColumnInList(0);
+ double[] inList = {5.0, 20.2};
+ filter.setInListValues(inList);
+ VectorExpression expr1 = filter;
+
+ expr1.evaluate(batch);
+
+ // Only rows 0 and 1 should survive, recorded through selected[].
+ assertEquals(2, batch.size);
+ assertTrue(batch.selectedInUse);
+ assertEquals(0, batch.selected[0]);
+ assertEquals(1, batch.selected[1]);
+ }
+
+ @Test
+ public void testFilterStringIn() {
+ int seed = 17;
+ VectorizedRowBatch vrb = VectorizedRowGroupGenUtil.getVectorizedRowBatch(
+ 3, 2, seed);
+ vrb.cols[0] = new BytesColumnVector();
+ BytesColumnVector bcv = (BytesColumnVector) vrb.cols[0];
+
+ // Rows 0..2 get the values a, b, c respectively.
+ // NOTE(review): a, b and c are fields declared elsewhere in this test
+ // class; the setVal(..., 0, 1) calls imply they are (at least)
+ // one-byte arrays -- confirm against the field declarations.
+ bcv.initBuffer();
+ bcv.setVal(0, a, 0, 1);
+ bcv.setVal(1, b, 0, 1);
+ bcv.setVal(2, c, 0, 1);
+
+ // The IN list accepts b and c, so row 0 (value a) never matches.
+ VectorExpression expr = new FilterStringColumnInList(0);
+ byte[][] inList = {b, c};
+ ((FilterStringColumnInList) expr).setInListValues(inList);
+
+ // basic test
+ expr.evaluate(vrb);
+
+ assertEquals(2, vrb.size);
+ assertTrue(vrb.selectedInUse);
+ assertEquals(1, vrb.selected[0]);
+ assertEquals(2, vrb.selected[1]);
+
+ // nulls
+ // Reset the batch, then null out row 2: only row 1 (value b) remains.
+ vrb.selectedInUse = false;
+ vrb.size = 3;
+ bcv.noNulls = false;
+ bcv.isNull[2] = true;
+ expr.evaluate(vrb);
+ assertEquals(1, vrb.size);
+ assertEquals(1, vrb.selected[0]);
+ assertTrue(vrb.selectedInUse);
+
+ // repeating
+ // With isRepeating, only entry 0 (value a, not in the list) is
+ // consulted, so no rows qualify.
+ vrb.selectedInUse = false;
+ vrb.size = 3;
+ bcv.noNulls = true;
+ bcv.isRepeating = true;
+ expr.evaluate(vrb);
+ assertEquals(0, vrb.size);
+
+ // nulls and repeating
+ // Entry 0 is null and repeating, so every logical row is null and
+ // filtered out even though the stored value (b) is in the IN list.
+ vrb.selectedInUse = false;
+ vrb.size = 3;
+ bcv.noNulls = false;
+ bcv.isRepeating = true;
+ bcv.isNull[0] = true;
+ bcv.setVal(0, b, 0, 1);
+ expr.evaluate(vrb);
+ assertEquals(0, vrb.size);
+ }
}