You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by ji...@apache.org on 2015/10/29 05:44:58 UTC
[07/15] incubator-asterixdb git commit: ASTERIXDB-1102: VarSize
Encoding to store length of String and ByteArray
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
deleted file mode 100644
index 6472b68..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8NGramTokenFactory.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
-
- private static final long serialVersionUID = 1L;
-
- public HashedUTF8NGramTokenFactory() {
- super();
- }
-
- public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
-
- @Override
- public IToken createToken() {
- return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
deleted file mode 100644
index 6911b25..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordToken.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class HashedUTF8WordToken extends UTF8WordToken {
-
- private int hash = 0;
-
- public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
-
- @Override
- public boolean equals(Object o) {
- if (o == null) {
- return false;
- }
- if (!(o instanceof IToken)) {
- return false;
- }
- IToken t = (IToken) o;
- if (t.getTokenLength() != tokenLength) {
- return false;
- }
- int offset = 0;
- for (int i = 0; i < tokenLength; i++) {
- if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils.charAt(data, start + offset)) {
- return false;
- }
- offset += StringUtils.charSize(data, start + offset);
- }
- return true;
- }
-
- @Override
- public int hashCode() {
- return hash;
- }
-
- @Override
- public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
- super.reset(data, start, length, tokenLength, tokenCount);
-
- // pre-compute hash value using JAQL-like string hashing
- int pos = start;
- hash = GOLDEN_RATIO_32;
- for (int i = 0; i < tokenLength; i++) {
- hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- hash *= GOLDEN_RATIO_32;
- pos += StringUtils.charSize(data, pos);
- }
- hash += tokenCount;
- }
-
- @Override
- public void serializeToken(DataOutput dos) throws IOException {
- if (tokenTypeTag > 0) {
- dos.write(tokenTypeTag);
- }
-
- // serialize hash value
- dos.writeInt(hash);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
deleted file mode 100644
index 50bc67c..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/HashedUTF8WordTokenFactory.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
-
- private static final long serialVersionUID = 1L;
-
- public HashedUTF8WordTokenFactory() {
- super();
- }
-
- public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
-
- @Override
- public IToken createToken() {
- return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
deleted file mode 100644
index 86359e1..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizer.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public interface IBinaryTokenizer {
- public IToken getToken();
-
- public boolean hasNext();
-
- public void next();
-
- public void reset(byte[] data, int start, int length);
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
deleted file mode 100644
index f7cf4d5..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-
-public interface IBinaryTokenizerFactory extends Serializable {
- public IBinaryTokenizer createTokenizer();
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java
deleted file mode 100644
index 81f7b44..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/INGramToken.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public interface INGramToken {
- public int getNumPostChars();
-
- public int getNumPreChars();
-
- public void setNumPrePostChars(int numPreChars, int numPostChars);
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java
deleted file mode 100644
index 6d7b05d..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/IToken.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public interface IToken {
- public byte[] getData();
-
- public int getLength();
-
- public int getStart();
-
- public int getTokenLength();
-
- public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount);
-
- public void serializeToken(DataOutput dos) throws IOException;
-
- public void serializeTokenCount(DataOutput dos) throws IOException;
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
deleted file mode 100644
index 245530f..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/ITokenFactory.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-
-public interface ITokenFactory extends Serializable {
- public IToken createToken();
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 88c58b2..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
-
- private int gramLength;
- private boolean usePrePost;
-
- private int gramNum;
- private int totalGrams;
-
- private final INGramToken concreteToken;
-
- public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
- boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
- super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
- this.gramLength = gramLength;
- this.usePrePost = usePrePost;
- concreteToken = (INGramToken) token;
- }
-
- @Override
- public boolean hasNext() {
- if (gramNum < totalGrams) {
- return true;
- } else {
- return false;
- }
- }
-
- @Override
- public void next() {
- int currentTokenStart = index;
- int tokenCount = 1;
- int numPreChars = 0;
- int numPostChars = 0;
- if (usePrePost) {
- numPreChars = Math.max(gramLength - gramNum - 1, 0);
- numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0;
- }
- gramNum++;
-
- concreteToken.setNumPrePostChars(numPreChars, numPostChars);
- if (numPreChars == 0) {
- index += StringUtils.charSize(data, index);
- }
-
- // compute token count
- // ignore pre and post grams for duplicate detection
- if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
- int tmpIndex = start;
- while (tmpIndex < currentTokenStart) {
- tokenCount++; // assume found
- int offset = 0;
- for (int j = 0; j < gramLength; j++) {
- if (StringUtils.toLowerCase(StringUtils.charAt(data, currentTokenStart + offset)) != StringUtils
- .toLowerCase(StringUtils.charAt(data, tmpIndex + offset))) {
- tokenCount--;
- break;
- }
- offset += StringUtils.charSize(data, tmpIndex + offset);
- }
- tmpIndex += StringUtils.charSize(data, tmpIndex);
- }
- }
-
- // set token
- token.reset(data, currentTokenStart, length, gramLength, tokenCount);
- }
-
- @Override
- public void reset(byte[] data, int start, int length) {
- super.reset(data, start, length);
- gramNum = 0;
-
- int numChars = 0;
- int pos = index;
- int end = pos + utf8Length;
- while (pos < end) {
- numChars++;
- pos += StringUtils.charSize(data, pos);
- }
-
- if (usePrePost) {
- totalGrams = numChars + gramLength - 1;
- } else {
- totalGrams = numChars - gramLength + 1;
- }
- }
-
- public void setGramlength(int gramLength) {
- this.gramLength = gramLength;
- }
-
- public void setPrePost(boolean usePrePost) {
- this.usePrePost = usePrePost;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java
deleted file mode 100644
index d3afd80..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/StringUtils.java
+++ /dev/null
@@ -1,216 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class StringUtils {
- public static char charAt(byte[] b, int s) {
- int c = b[s] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- return (char) c;
-
- case 12:
- case 13:
- return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
-
- case 14:
- return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
-
- default:
- throw new IllegalArgumentException();
- }
- }
-
- public static int charSize(byte[] b, int s) {
- int c = b[s] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- return 1;
-
- case 12:
- case 13:
- return 2;
-
- case 14:
- return 3;
- }
- throw new IllegalStateException();
- }
-
- public static int getModifiedUTF8Len(char c) {
- if (c >= 0x0000 && c <= 0x007F) {
- return 1;
- } else if (c <= 0x07FF) {
- return 2;
- } else {
- return 3;
- }
- }
-
- public static int getStrLen(byte[] b, int s) {
- int pos = s + 2;
- int end = pos + getUTFLen(b, s);
- int charCount = 0;
- while (pos < end) {
- charCount++;
- pos += charSize(b, pos);
- }
- return charCount;
- }
-
- public static int getUTFLen(byte[] b, int s) {
- return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
- }
-
- public static char toLowerCase(char c) {
- switch (c) {
- case 'A':
- return 'a';
- case 'B':
- return 'b';
- case 'C':
- return 'c';
- case 'D':
- return 'd';
- case 'E':
- return 'e';
- case 'F':
- return 'f';
- case 'G':
- return 'g';
- case 'H':
- return 'h';
- case 'I':
- return 'i';
- case 'J':
- return 'j';
- case 'K':
- return 'k';
- case 'L':
- return 'l';
- case 'M':
- return 'm';
- case 'N':
- return 'n';
- case 'O':
- return 'o';
- case 'P':
- return 'p';
- case 'Q':
- return 'q';
- case 'R':
- return 'r';
- case 'S':
- return 's';
- case 'T':
- return 't';
- case 'U':
- return 'u';
- case 'V':
- return 'v';
- case 'W':
- return 'w';
- case 'X':
- return 'x';
- case 'Y':
- return 'y';
- case 'Z':
- return 'z';
- case 'Ä':
- return 'ä';
- case 'Ǟ':
- return 'ǟ';
- case 'Ë':
- return 'ë';
- case 'Ḧ':
- return 'ḧ';
- case 'Ï':
- return 'ï';
- case 'Ḯ':
- return 'ḯ';
- case 'Ö':
- return 'ö';
- case 'Ȫ':
- return 'ȫ';
- case 'Ṏ':
- return 'ṏ';
- case 'Ü':
- return 'ü';
- case 'Ǖ':
- return 'ǖ';
- case 'Ǘ':
- return 'ǘ';
- case 'Ǚ':
- return 'ǚ';
- case 'Ǜ':
- return 'ǜ';
- case 'Ṳ':
- return 'ṳ';
- case 'Ṻ':
- return 'ṻ';
- case 'Ẅ':
- return 'ẅ';
- case 'Ẍ':
- return 'ẍ';
- case 'Ÿ':
- return 'ÿ';
- default:
- // since I probably missed some chars above
- // use Java to convert to lower case to be safe
- return Character.toLowerCase(c);
- }
- }
-
- public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
-
- if (c >= 0x0000 && c <= 0x007F) {
- dos.writeByte(c);
- } else if (c <= 0x07FF) {
- dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
- dos.writeByte((byte) (0x80 | (c & 0x3F)));
- } else {
- dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
- dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
- dos.writeByte((byte) (0x80 | (c & 0x3F)));
- }
- }
-
- public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
- dos.write((len >>> 8) & 0xFF);
- dos.write((len >>> 0) & 0xFF);
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
deleted file mode 100644
index a3326c4..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramToken.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
-
- public final static char PRECHAR = '#';
-
- public final static char POSTCHAR = '$';
-
- protected int numPreChars;
- protected int numPostChars;
-
- public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
-
- @Override
- public int getNumPostChars() {
- return numPreChars;
- }
-
- @Override
- public int getNumPreChars() {
- return numPostChars;
- }
-
- @Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
-
- // regular chars
- int numRegChars = tokenLength - numPreChars - numPostChars;
-
- // assuming pre and post char need 1-byte each in utf8
- int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars;
-
- // write utf8 length indicator
- StringUtils.writeUTF8Len(tokenUTF8Len, dos);
-
- // pre chars
- for (int i = 0; i < numPreChars; i++) {
- StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
- }
-
- int pos = start;
- for (int i = 0; i < numRegChars; i++) {
- char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- StringUtils.writeCharAsModifiedUTF8(c, dos);
- pos += StringUtils.charSize(data, pos);
- }
-
- // post chars
- for (int i = 0; i < numPostChars; i++) {
- StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
- }
- }
-
- public void setNumPrePostChars(int numPreChars, int numPostChars) {
- this.numPreChars = numPreChars;
- this.numPostChars = numPostChars;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
deleted file mode 100644
index 520aa66..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8NGramTokenFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
-
- private static final long serialVersionUID = 1L;
-
- public UTF8NGramTokenFactory() {
- super();
- }
-
- public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
-
- @Override
- public IToken createToken() {
- return new UTF8NGramToken(tokenTypeTag, countTypeTag);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
deleted file mode 100644
index 41a8105..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordToken.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-public class UTF8WordToken extends AbstractUTF8Token {
-
- public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
-
- @Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
-
- int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
- StringUtils.writeUTF8Len(tokenUTF8Len, dos);
- int pos = start;
- for (int i = 0; i < tokenLength; i++) {
- char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
- StringUtils.writeCharAsModifiedUTF8(c, dos);
- pos += StringUtils.charSize(data, pos);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java b/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
deleted file mode 100644
index 9d15db9..0000000
--- a/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/UTF8WordTokenFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
-
- private static final long serialVersionUID = 1L;
-
- public UTF8WordTokenFactory() {
- super();
- }
-
- public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
- super(tokenTypeTag, countTypeTag);
- }
-
- @Override
- public IToken createToken() {
- return new UTF8WordToken(tokenTypeTag, countTypeTag);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
deleted file mode 100644
index d10aefb..0000000
--- a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/NGramTokenizerTest.java
+++ /dev/null
@@ -1,239 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tests;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
-import org.apache.asterix.fuzzyjoin.tokenizer.HashedUTF8NGramTokenFactory;
-import org.apache.asterix.fuzzyjoin.tokenizer.IToken;
-import org.apache.asterix.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.UTF8NGramTokenFactory;
-
-public class NGramTokenizerTest {
-
- private char PRECHAR = '#';
- private char POSTCHAR = '$';
-
- private String str = "Jürgen S. Generic's Car";
- private byte[] inputBuffer;
-
- private int gramLength = 3;
-
- private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
-
- String tmp = s.toLowerCase();
- if (prePost) {
- StringBuilder preBuilder = new StringBuilder();
- for (int i = 0; i < gramLength - 1; i++) {
- preBuilder.append(PRECHAR);
- }
- String pre = preBuilder.toString();
-
- StringBuilder postBuilder = new StringBuilder();
- for (int i = 0; i < gramLength - 1; i++) {
- postBuilder.append(POSTCHAR);
- }
- String post = postBuilder.toString();
-
- tmp = pre + s.toLowerCase() + post;
- }
-
- for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
- String gram = tmp.substring(i, i + gramLength);
- grams.add(gram);
- }
- }
-
- @Before
- public void init() throws Exception {
- // serialize string into bytes
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput dos = new DataOutputStream(baos);
- dos.writeUTF(str);
- inputBuffer = baos.toByteArray();
- }
-
- void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
- HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
- false, tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
- ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
- HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
- for (String s : expectedGrams) {
- Integer count = gramCounts.get(s);
- if (count == null) {
- count = 1;
- gramCounts.put(s, count);
- } else {
- count++;
- }
-
- int hash = tokenHash(s, count);
- expectedHashedGrams.add(hash);
- }
-
- int tokenCount = 0;
-
- while (tokenizer.hasNext()) {
- tokenizer.next();
-
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
-
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
-
- Integer hashedGram = in.readInt();
-
- // System.out.println(hashedGram);
-
- Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
-
- tokenCount++;
- }
- // System.out.println("---------");
- }
-
- void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
- HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
- tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
- ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
- for (String s : expectedGrams) {
- int hash = tokenHash(s, 1);
- expectedHashedGrams.add(hash);
- }
-
- int tokenCount = 0;
-
- while (tokenizer.hasNext()) {
- tokenizer.next();
-
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
-
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
-
- Integer hashedGram = in.readInt();
-
- // System.out.println(hashedGram);
-
- Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
-
- tokenCount++;
- }
- // System.out.println("---------");
- }
-
- void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
- UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
- tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
-
- int tokenCount = 0;
-
- while (tokenizer.hasNext()) {
- tokenizer.next();
-
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
-
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
-
- String strGram = in.readUTF();
-
- // System.out.println("\"" + strGram + "\"");
-
- Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
-
- tokenCount++;
- }
- // System.out.println("---------");
- }
-
- @Test
- public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
- runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
- runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
- }
-
- @Test
- public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
- runTestNGramTokenizerWithHashedUTF8Tokens(false);
- runTestNGramTokenizerWithHashedUTF8Tokens(true);
- }
-
- @Test
- public void testNGramTokenizerWithUTF8Tokens() throws IOException {
- runTestNGramTokenizerWithUTF8Tokens(false);
- runTestNGramTokenizerWithUTF8Tokens(true);
- }
-
- public int tokenHash(String token, int tokenCount) {
- int h = AbstractUTF8Token.GOLDEN_RATIO_32;
- for (int i = 0; i < token.length(); i++) {
- h ^= token.charAt(i);
- h *= AbstractUTF8Token.GOLDEN_RATIO_32;
- }
- return h + tokenCount;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java
----------------------------------------------------------------------
diff --git a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java b/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java
deleted file mode 100644
index a4afe0c..0000000
--- a/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/tests/WordTokenizerTest.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tests;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import junit.framework.Assert;
-
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.AbstractUTF8Token;
-import org.apache.asterix.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.HashedUTF8WordTokenFactory;
-import org.apache.asterix.fuzzyjoin.tokenizer.IToken;
-import org.apache.asterix.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
-
-public class WordTokenizerTest {
-
- private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
- private byte[] inputBuffer;
-
- private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
- private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
- private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
-
- @Before
- public void init() throws IOException {
- // serialize text into bytes
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput dos = new DataOutputStream(baos);
- dos.writeUTF(text);
- inputBuffer = baos.toByteArray();
-
- // init expected string tokens
- expectedUTF8Tokens.add("hello");
- expectedUTF8Tokens.add("world");
- expectedUTF8Tokens.add("i");
- expectedUTF8Tokens.add("would");
- expectedUTF8Tokens.add("like");
- expectedUTF8Tokens.add("to");
- expectedUTF8Tokens.add("inform");
- expectedUTF8Tokens.add("you");
- expectedUTF8Tokens.add("of");
- expectedUTF8Tokens.add("the");
- expectedUTF8Tokens.add("importance");
- expectedUTF8Tokens.add("of");
- expectedUTF8Tokens.add("foo");
- expectedUTF8Tokens.add("bar");
- expectedUTF8Tokens.add("yes");
- expectedUTF8Tokens.add("foo");
- expectedUTF8Tokens.add("bar");
- expectedUTF8Tokens.add("jürgen");
-
- // hashed tokens ignoring token count
- for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
- int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
- expectedHashedUTF8Tokens.add(hash);
- }
-
- // hashed tokens using token count
- HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
- for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
- Integer count = tokenCounts.get(expectedUTF8Tokens.get(i));
- if (count == null) {
- count = 1;
- tokenCounts.put(expectedUTF8Tokens.get(i), count);
- } else {
- count++;
- }
-
- int hash = tokenHash(expectedUTF8Tokens.get(i), count);
- expectedCountedHashedUTF8Tokens.add(hash);
- }
- }
-
- @Test
- public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
-
- HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
- DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
- tokenFactory);
-
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
- int tokenCount = 0;
-
- while (tokenizer.hasNext()) {
- tokenizer.next();
-
- // serialize token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
-
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
-
- Integer hashedToken = in.readInt();
-
- // System.out.println(hashedToken);
-
- Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount));
-
- tokenCount++;
- }
- }
-
- @Test
- public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
-
- HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
- DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
-
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
- int tokenCount = 0;
-
- while (tokenizer.hasNext()) {
- tokenizer.next();
-
- // serialize token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
-
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
-
- Integer hashedToken = in.readInt();
-
- // System.out.println(hashedToken);
-
- Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
-
- tokenCount++;
- }
- }
-
- @Test
- public void testWordTokenizerWithUTF8Tokens() throws IOException {
-
- UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
- DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
-
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
-
- int tokenCount = 0;
-
- while (tokenizer.hasNext()) {
- tokenizer.next();
-
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
-
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
-
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
-
- String strToken = in.readUTF();
-
- // System.out.println(strToken);
-
- Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
-
- tokenCount++;
- }
- }
-
- // JAQL
- public int tokenHash(String token, int tokenCount) {
- int h = AbstractUTF8Token.GOLDEN_RATIO_32;
- for (int i = 0; i < token.length(); i++) {
- h ^= token.charAt(i);
- h *= AbstractUTF8Token.GOLDEN_RATIO_32;
- }
- return h + tokenCount;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
index b88ed3a..ac975c4 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalFileIndexAccessor.java
@@ -54,6 +54,7 @@ import org.apache.hyracks.storage.am.lsm.common.api.ILSMIndexAccessorInternal;
@SuppressWarnings({ "rawtypes", "unchecked" })
public class ExternalFileIndexAccessor implements Serializable {
+ private final FilesIndexDescription filesIndexDescription = new FilesIndexDescription();
private static final long serialVersionUID = 1L;
private ExternalBTreeDataflowHelper indexDataflowHelper;
private ExternalLoopkupOperatorDiscriptor opDesc;
@@ -119,7 +120,7 @@ public class ExternalFileIndexAccessor implements Serializable {
int recordLength = tuple.getFieldLength(FilesIndexDescription.FILE_PAYLOAD_INDEX);
ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
DataInput in = new DataInputStream(stream);
- ARecord externalFileRecord = (ARecord) FilesIndexDescription.EXTERNAL_FILE_RECORD_SERDE.deserialize(in);
+ ARecord externalFileRecord = (ARecord) filesIndexDescription.EXTERNAL_FILE_RECORD_SERDE.deserialize(in);
setExternalFileFromARecord(externalFileRecord, file);
} else {
// This should never happen
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
index 07c8e5f..a7844ce 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/ExternalLoopkupOperatorDiscriptor.java
@@ -40,7 +40,6 @@ import org.apache.hyracks.storage.common.IStorageManagerInterface;
* This operator is intended for using record ids to access data in external sources
*/
public class ExternalLoopkupOperatorDiscriptor extends AbstractTreeIndexOperatorDescriptor {
-
private static final long serialVersionUID = 1L;
private final IControlledAdapterFactory adapterFactory;
private final INullWriterFactory iNullWriterFactory;
@@ -53,8 +52,8 @@ public class ExternalLoopkupOperatorDiscriptor extends AbstractTreeIndexOperator
ISearchOperationCallbackFactory searchOpCallbackFactory, boolean retainNull,
INullWriterFactory iNullWriterFactory) {
super(spec, 1, 1, outRecDesc, storageManager, lcManagerProvider, fileSplitProvider,
- FilesIndexDescription.EXTERNAL_FILE_INDEX_TYPE_TRAITS,
- FilesIndexDescription.FILES_INDEX_COMP_FACTORIES, FilesIndexDescription.BLOOM_FILTER_FIELDS,
+ new FilesIndexDescription().EXTERNAL_FILE_INDEX_TYPE_TRAITS,
+ new FilesIndexDescription().FILES_INDEX_COMP_FACTORIES, FilesIndexDescription.BLOOM_FILTER_FIELDS,
externalFilesIndexDataFlowHelperFactory, null, propagateInput, retainNull, iNullWriterFactory, null,
searchOpCallbackFactory, null);
this.adapterFactory = adapterFactory;
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
index 0474ae5..cb4c0d2 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/external/FilesIndexDescription.java
@@ -41,58 +41,64 @@ public class FilesIndexDescription {
public final static int FILE_KEY_INDEX = 0;
public final static int FILE_KEY_SIZE = 1;
public final static int FILE_PAYLOAD_INDEX = 1;
- public static RecordDescriptor FILE_INDEX_RECORD_DESCRIPTOR;
- public static RecordDescriptor FILE_BUDDY_BTREE_RECORD_DESCRIPTOR;
public final static String[] payloadFieldNames = { "FileName", "FileSize", "FileModDate" };
public final static IAType[] payloadFieldTypes = { BuiltinType.ASTRING, BuiltinType.AINT64, BuiltinType.ADATETIME };
- public static ARecordType EXTERNAL_FILE_RECORD_TYPE;
- public static ISerializerDeserializer EXTERNAL_FILE_RECORD_SERDE;
- public static final ISerializerDeserializer[] EXTERNAL_FILE_BUDDY_BTREE_FIELDS = new ISerializerDeserializer[1];
- public static final ITypeTraits[] EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS = new ITypeTraits[1];
- public static final ISerializerDeserializer[] EXTERNAL_FILE_TUPLE_FIELDS = new ISerializerDeserializer[FILE_INDEX_TUPLE_SIZE];
- public static final ITypeTraits[] EXTERNAL_FILE_INDEX_TYPE_TRAITS = new ITypeTraits[FILE_INDEX_TUPLE_SIZE];
- public static final IBinaryComparatorFactory[] FILES_INDEX_COMP_FACTORIES = new IBinaryComparatorFactory[] { AqlBinaryComparatorFactoryProvider.INSTANCE
- .getBinaryComparatorFactory(BuiltinType.AINT32, true) };
+
public static final int[] BLOOM_FILTER_FIELDS = { 0 };
public static final int EXTERNAL_FILE_NAME_FIELD_INDEX = 0;
public static final int EXTERNAL_FILE_SIZE_FIELD_INDEX = 1;
public static final int EXTERNAL_FILE_MOD_DATE_FIELD_INDEX = 2;
- static {
- try {
- EXTERNAL_FILE_RECORD_TYPE = new ARecordType("ExternalFileRecordType", payloadFieldNames, payloadFieldTypes,
- true);
- EXTERNAL_FILE_RECORD_SERDE = AqlSerializerDeserializerProvider.INSTANCE
- .getSerializerDeserializer(EXTERNAL_FILE_RECORD_TYPE);
- EXTERNAL_FILE_TUPLE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
- .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
- EXTERNAL_FILE_TUPLE_FIELDS[FILE_PAYLOAD_INDEX] = EXTERNAL_FILE_RECORD_SERDE;
- EXTERNAL_FILE_BUDDY_BTREE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
- .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+ public final ARecordType EXTERNAL_FILE_RECORD_TYPE;
+ public final ITypeTraits[] EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS = new ITypeTraits[1];
+ public final ITypeTraits[] EXTERNAL_FILE_INDEX_TYPE_TRAITS = new ITypeTraits[FILE_INDEX_TUPLE_SIZE];
- EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
- .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
- EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_PAYLOAD_INDEX] = AqlTypeTraitProvider.INSTANCE
- .getTypeTrait(EXTERNAL_FILE_RECORD_TYPE);
- EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
- .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+ public final ISerializerDeserializer EXTERNAL_FILE_RECORD_SERDE;
+ public final RecordDescriptor FILE_INDEX_RECORD_DESCRIPTOR;
+ public final RecordDescriptor FILE_BUDDY_BTREE_RECORD_DESCRIPTOR;
+ public final ISerializerDeserializer[] EXTERNAL_FILE_BUDDY_BTREE_FIELDS = new ISerializerDeserializer[1];
+ public final ISerializerDeserializer[] EXTERNAL_FILE_TUPLE_FIELDS = new ISerializerDeserializer[FILE_INDEX_TUPLE_SIZE];
+ public final IBinaryComparatorFactory[] FILES_INDEX_COMP_FACTORIES = new IBinaryComparatorFactory[] {
+ AqlBinaryComparatorFactoryProvider.INSTANCE.getBinaryComparatorFactory(BuiltinType.AINT32, true) };
- FILE_INDEX_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_TUPLE_FIELDS,
- EXTERNAL_FILE_INDEX_TYPE_TRAITS);
-
- FILE_BUDDY_BTREE_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_BUDDY_BTREE_FIELDS,
- EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS);
+ public FilesIndexDescription() {
+ ARecordType type;
+ try {
+ type = new ARecordType("ExternalFileRecordType", payloadFieldNames,
+ payloadFieldTypes, true);
} catch (Exception e) {
e.printStackTrace();
- System.exit(1);
+ throw new RuntimeException(e);
}
+ EXTERNAL_FILE_RECORD_TYPE = type;
+ EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
+ .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+ EXTERNAL_FILE_INDEX_TYPE_TRAITS[FILE_PAYLOAD_INDEX] = AqlTypeTraitProvider.INSTANCE
+ .getTypeTrait(EXTERNAL_FILE_RECORD_TYPE);
+ EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS[FILE_KEY_INDEX] = AqlTypeTraitProvider.INSTANCE
+ .getTypeTrait(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+
+ EXTERNAL_FILE_RECORD_SERDE = AqlSerializerDeserializerProvider.INSTANCE
+ .getSerializerDeserializer(EXTERNAL_FILE_RECORD_TYPE);
+
+ EXTERNAL_FILE_TUPLE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
+ .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+ EXTERNAL_FILE_TUPLE_FIELDS[FILE_PAYLOAD_INDEX] = EXTERNAL_FILE_RECORD_SERDE;
+ EXTERNAL_FILE_BUDDY_BTREE_FIELDS[FILE_KEY_INDEX] = AqlSerializerDeserializerProvider.INSTANCE
+ .getSerializerDeserializer(IndexingConstants.FILE_NUMBER_FIELD_TYPE);
+
+ FILE_INDEX_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_TUPLE_FIELDS,
+ EXTERNAL_FILE_INDEX_TYPE_TRAITS);
+
+ FILE_BUDDY_BTREE_RECORD_DESCRIPTOR = new RecordDescriptor(EXTERNAL_FILE_BUDDY_BTREE_FIELDS,
+ EXTERNAL_FILE_BUDDY_BTREE_TYPE_TRAITS);
}
@SuppressWarnings("unchecked")
- public static void getBuddyBTreeTupleFromFileNumber(ArrayTupleReference tuple, ArrayTupleBuilder tupleBuilder,
+ public void getBuddyBTreeTupleFromFileNumber(ArrayTupleReference tuple, ArrayTupleBuilder tupleBuilder,
AMutableInt32 aInt32) throws IOException, AsterixException {
tupleBuilder.reset();
- FilesIndexDescription.FILE_BUDDY_BTREE_RECORD_DESCRIPTOR.getFields()[0].serialize(aInt32,
+ FILE_BUDDY_BTREE_RECORD_DESCRIPTOR.getFields()[0].serialize(aInt32,
tupleBuilder.getDataOutput());
tupleBuilder.addFieldEndOffset();
tuple.reset(tupleBuilder.getFieldEndOffsets(), tupleBuilder.getByteArray());
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
index a1a5b5c..e79fe1c 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatasetNameValueExtractor.java
@@ -25,6 +25,7 @@ import java.io.DataInputStream;
import org.apache.asterix.common.transactions.JobId;
import org.apache.asterix.dataflow.data.nontagged.serde.AObjectSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AStringSerializerDeserializer;
import org.apache.asterix.metadata.MetadataException;
import org.apache.asterix.metadata.api.IValueExtractor;
import org.apache.asterix.om.base.AString;
@@ -36,6 +37,8 @@ import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
* contains a serialized representation of a Dataset metadata entity.
*/
public class DatasetNameValueExtractor implements IValueExtractor<String> {
+ private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer();
+
@Override
public String getValue(JobId jobId, ITupleReference tuple) throws MetadataException, HyracksDataException {
byte[] serRecord = tuple.getFieldData(2);
@@ -43,6 +46,6 @@ public class DatasetNameValueExtractor implements IValueExtractor<String> {
int recordLength = tuple.getFieldLength(2);
ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
DataInput in = new DataInputStream(stream);
- return (((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue());
+ return (((AString) aObjSerDer.deserialize(in)).getStringValue());
}
}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
index 9d5e8b1..9a50a31 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/DatatypeNameValueExtractor.java
@@ -40,6 +40,7 @@ import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
public class DatatypeNameValueExtractor implements IValueExtractor<String> {
private final String dataverseName;
private final MetadataNode metadataNode;
+ private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer();
public DatatypeNameValueExtractor(String dataverseName, MetadataNode metadataNode) {
this.dataverseName = dataverseName;
@@ -53,7 +54,7 @@ public class DatatypeNameValueExtractor implements IValueExtractor<String> {
int recordLength = tuple.getFieldLength(2);
ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
DataInput in = new DataInputStream(stream);
- String typeName = ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue();
+ String typeName = ((AString) aObjSerDer.deserialize(in)).getStringValue();
try {
if (metadataNode.getDatatype(jobId, dataverseName, typeName).getIsAnonymous()) {
// Get index 0 because it is anonymous type, and it is used in
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
----------------------------------------------------------------------
diff --git a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
index d046650..41d92c9 100644
--- a/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
+++ b/asterix-metadata/src/main/java/org/apache/asterix/metadata/valueextractors/NestedDatatypeNameValueExtractor.java
@@ -43,6 +43,7 @@ public class NestedDatatypeNameValueExtractor implements IValueExtractor<String>
public NestedDatatypeNameValueExtractor(String datatypeName) {
this.datatypeName = datatypeName;
}
+ private final AObjectSerializerDeserializer aObjSerDer = new AObjectSerializerDeserializer();
@Override
public String getValue(JobId jobId, ITupleReference tuple) throws MetadataException, HyracksDataException {
@@ -51,13 +52,13 @@ public class NestedDatatypeNameValueExtractor implements IValueExtractor<String>
int recordLength = tuple.getFieldLength(2);
ByteArrayInputStream stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
DataInput in = new DataInputStream(stream);
- String nestedType = ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue();
+ String nestedType = ((AString) aObjSerDer.deserialize(in)).getStringValue();
if (nestedType.equals(datatypeName)) {
recordStartOffset = tuple.getFieldStart(1);
recordLength = tuple.getFieldLength(1);
stream = new ByteArrayInputStream(serRecord, recordStartOffset, recordLength);
in = new DataInputStream(stream);
- return ((AString) AObjectSerializerDeserializer.INSTANCE.deserialize(in)).getStringValue();
+ return ((AString) aObjSerDer.deserialize(in)).getStringValue();
}
return null;
}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java b/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
index d664e12..55ff32f 100644
--- a/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
+++ b/asterix-om/src/main/java/org/apache/asterix/builders/RecordBuilder.java
@@ -45,6 +45,7 @@ public class RecordBuilder implements IARecordBuilder {
private final static int DEFAULT_NUM_OPEN_FIELDS = 10;
private final static byte SER_NULL_TYPE_TAG = ATypeTag.NULL.serialize();
private final static byte RECORD_TYPE_TAG = ATypeTag.RECORD.serialize();
+ private final UTF8StringSerializerDeserializer utf8SerDer = new UTF8StringSerializerDeserializer();
private int openPartOffsetArraySize;
private byte[] openPartOffsetArray;
@@ -226,9 +227,8 @@ public class RecordBuilder implements IARecordBuilder {
for (int i = 1; i < numberOfOpenFields; i++) {
if (utf8Comparator.compare(openBytes, (int) openPartOffsets[i - 1], openFieldNameLengths[i - 1],
openBytes, (int) openPartOffsets[i], openFieldNameLengths[i]) == 0) {
- String field = UTF8StringSerializerDeserializer.INSTANCE
- .deserialize(new DataInputStream(new ByteArrayInputStream(openBytes,
- (int) openPartOffsets[i], openFieldNameLengths[i])));
+ String field = utf8SerDer.deserialize(new DataInputStream(new ByteArrayInputStream(openBytes,
+ (int) openPartOffsets[i], openFieldNameLengths[i])));
throw new AsterixException("Open fields " + (i - 1) + " and " + i
+ " have the same field name \"" + field + "\"");
}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
index f019f10..a3bff52 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AListElementToken.java
@@ -26,8 +26,8 @@ import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
public class AListElementToken implements IToken {
protected byte[] data;
- protected int start;
- protected int length;
+ protected int startOffset;
+ protected int endOffset;
protected int tokenLength;
protected int typeTag;
@@ -37,13 +37,13 @@ public class AListElementToken implements IToken {
}
@Override
- public int getLength() {
- return length;
+ public int getEndOffset() {
+ return endOffset;
}
@Override
- public int getStart() {
- return start;
+ public int getStartOffset() {
+ return startOffset;
}
@Override
@@ -52,10 +52,10 @@ public class AListElementToken implements IToken {
}
@Override
- public void reset(byte[] data, int start, int length, int tokenLength, int tokenCount) {
+ public void reset(byte[] data, int startOffset, int endOffset, int tokenLength, int tokenCount) {
this.data = data;
- this.start = start;
- this.length = length;
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
this.tokenLength = tokenLength;
// We abuse the last param, tokenCount, to pass the type tag.
typeTag = tokenCount;
@@ -64,7 +64,7 @@ public class AListElementToken implements IToken {
@Override
public void serializeToken(GrowableArray out) throws IOException {
out.getDataOutput().writeByte(typeTag);
- out.getDataOutput().write(data, start, length);
+ out.getDataOutput().write(data, startOffset, endOffset - startOffset);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
index 5f6e0b8..32207d3 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
@@ -59,9 +59,10 @@ public class AOrderedListBinaryTokenizer implements IBinaryTokenizer {
itemOffset = getItemOffset(data, start, itemIndex);
// Assuming homogeneous list.
ATypeTag typeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(data[start + 1]);
+ // Open question: can non-string item types be handled here?
length = NonTaggedFormatUtil.getFieldValueLength(data, itemOffset, typeTag, false);
// Last param is a hack to pass the type tag.
- token.reset(data, itemOffset, length, length, data[start + 1]);
+ token.reset(data, itemOffset, itemOffset + length, length, data[start + 1]);
} catch (AsterixException e) {
throw new IllegalStateException(e);
}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
index 64b5610..767a343 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
@@ -19,7 +19,6 @@
package org.apache.asterix.dataflow.data.nontagged.comparators;
-import org.apache.asterix.formats.nontagged.UTF8StringLowercasePointable;
import org.apache.asterix.om.types.ATypeTag;
import org.apache.asterix.om.types.EnumDeserializer;
import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
@@ -31,6 +30,7 @@ import org.apache.hyracks.data.std.primitive.DoublePointable;
import org.apache.hyracks.data.std.primitive.FloatPointable;
import org.apache.hyracks.data.std.primitive.IntegerPointable;
import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable;
public class ListItemBinaryComparatorFactory implements IBinaryComparatorFactory {
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
index 6935f24..493833b 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
@@ -21,7 +21,6 @@ package org.apache.asterix.dataflow.data.nontagged.hash;
import java.io.IOException;
-import org.apache.asterix.formats.nontagged.UTF8StringLowercasePointable;
import org.apache.asterix.om.types.ATypeTag;
import org.apache.asterix.om.types.EnumDeserializer;
import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction;
@@ -29,6 +28,7 @@ import org.apache.hyracks.api.dataflow.value.IBinaryHashFunctionFactory;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.data.std.accessors.MurmurHash3BinaryHashFunctionFamily;
import org.apache.hyracks.data.std.accessors.PointableBinaryHashFunctionFactory;
+import org.apache.hyracks.data.std.primitive.UTF8StringLowercasePointable;
import org.apache.hyracks.data.std.util.GrowableArray;
/**
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb/blob/742aba85/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
----------------------------------------------------------------------
diff --git a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
index 596e168..7d88a90 100644
--- a/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
+++ b/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
@@ -29,8 +29,13 @@ import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.bytes.HexPrinter;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
public class PrintTools {
+
private static final GregorianCalendarSystem gCalInstance = GregorianCalendarSystem.getInstance();
private static long CHRONON_OF_DAY = 24 * 60 * 60 * 1000;
@@ -185,13 +190,13 @@ public class PrintTools {
}
public static void writeUTF8StringAsCSV(byte[] b, int s, int l, OutputStream os) throws IOException {
- int stringLength = UTF8StringPointable.getUTFLength(b, s);
- int position = s + 2; // skip 2 bytes containing string size
+ int stringLength = UTF8StringUtil.getUTFLength(b, s);
+ int position = s + UTF8StringUtil.getNumBytesToStoreLength(stringLength);
int maxPosition = position + stringLength;
os.write('"');
while (position < maxPosition) {
- char c = UTF8StringPointable.charAt(b, position);
- int sz = UTF8StringPointable.charSize(b, position);
+ char c = UTF8StringUtil.charAt(b, position);
+ int sz = UTF8StringUtil.charSize(b, position);
if (c == '"') {
os.write('"');
}
@@ -202,13 +207,13 @@ public class PrintTools {
}
public static void writeUTF8StringAsJSON(byte[] b, int s, int l, OutputStream os) throws IOException {
- int stringLength = UTF8StringPointable.getUTFLength(b, s);
- int position = s + 2; // skip 2 bytes containing string size
- int maxPosition = position + stringLength;
+ int utfLength = UTF8StringUtil.getUTFLength(b, s);
+ int position = s + UTF8StringUtil.getNumBytesToStoreLength(utfLength); // skip the variable-size length prefix
+ int maxPosition = position + utfLength;
os.write('"');
while (position < maxPosition) {
- char c = UTF8StringPointable.charAt(b, position);
- int sz = UTF8StringPointable.charSize(b, position);
+ char c = UTF8StringUtil.charAt(b, position);
+ int sz = UTF8StringUtil.charSize(b, position);
switch (c) {
// escape
case '\b':
@@ -296,27 +301,9 @@ public class PrintTools {
os.write('u');
os.write('0');
os.write('0');
- os.write(hex((c >>> 4) & 0x0f, CASE.LOWER_CASE));
- os.write(hex(c & 0x0f, CASE.LOWER_CASE));
+ os.write(HexPrinter.hex((c >>> 4) & 0x0f, HexPrinter.CASE.LOWER_CASE));
+ os.write(HexPrinter.hex(c & 0x0f, HexPrinter.CASE.LOWER_CASE));
}
- public static Appendable printHexString(byte[] bytes, int start, int length, Appendable appendable)
- throws IOException {
- for (int i = 0; i < length; ++i) {
- appendable.append((char) hex((bytes[start + i] >>> 4) & 0x0f, CASE.UPPER_CASE));
- appendable.append((char) hex((bytes[start + i] & 0x0f), CASE.UPPER_CASE));
- }
- return appendable;
- }
-
- public static byte hex(int i, CASE c) {
- switch (c) {
- case LOWER_CASE:
- return (byte) (i < 10 ? i + '0' : i + ('a' - 10));
- case UPPER_CASE:
- return (byte) (i < 10 ? i + '0' : i + ('A' - 10));
- }
- return Byte.parseByte(null);
- }
}