You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@asterixdb.apache.org by im...@apache.org on 2020/04/17 07:32:43 UTC

[asterixdb] branch master updated (6e6b342 -> f85cee9)

This is an automated email from the ASF dual-hosted git repository.

imaxon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git.


    from 6e6b342  [ASTERIXDB-2708] Introduce batch and stateful point cursors
     new 3d0897b  [ASTERIXDB-2709][COMP] Fix registration of st_union aggregates
     new 934ce9d  [NO ISSUE] Rename tokenizer factories
     new f85cee9  [NO ISSUE] Remove out-of-date tokenizer

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../two-geometries/aggregate.17.query.sqlpp}       |   4 +
 .../results/geojson/two-geometries/result.17.adm   |   1 +
 .../asterix/fuzzyjoin/FuzzyJoinAppendLength.java   |  58 ---------
 .../asterix/fuzzyjoin/FuzzyJoinTokenize.java       | 133 ---------------------
 .../fuzzyjoin/similarity/SimilarityMetric.java     |   4 -
 .../similarity/SimilarityMetricJaccard.java        |  21 ----
 .../fuzzyjoin/tokenizer/NGramTokenizer.java        |  90 --------------
 .../apache/asterix/fuzzyjoin/tokenizer/Token.java  | 118 ------------------
 .../asterix/fuzzyjoin/tokenizer/Tokenizer.java     |  27 -----
 .../fuzzyjoin/tokenizer/TokenizerBuffered.java     |  30 -----
 .../tokenizer/TokenizerBufferedFactory.java        |  34 ------
 .../fuzzyjoin/tokenizer/TokenizerFactory.java      |  31 -----
 .../asterix/fuzzyjoin/tokenizer/WordTokenizer.java |  68 -----------
 .../fuzzyjoin/tokenizer/WordTokenizerBuffered.java |  92 --------------
 .../fuzzyjoin/tokenorder/IntTokenCountRank.java    |  28 -----
 .../tokenorder/IntTokenCountRankFrequency.java     |  58 ---------
 .../asterix/fuzzyjoin/tokenorder/IntTokenRank.java |  28 -----
 .../tokenorder/IntTokenRankFrequency.java          |  54 ---------
 .../asterix/fuzzyjoin/tokenorder/TokenLoad.java    |  61 ----------
 .../asterix/fuzzyjoin/tokenorder/TokenRank.java    |  31 -----
 .../tokenorder/TokenRankBufferedFrequency.java     |  75 ------------
 .../fuzzyjoin/tokenorder/TokenRankFrequency.java   |  61 ----------
 .../nontagged/BinaryTokenizerFactoryProvider.java  |  27 +++--
 .../asterix/om/functions/BuiltinFunctions.java     |  28 ++++-
 24 files changed, 44 insertions(+), 1118 deletions(-)
 copy asterixdb/asterix-app/src/test/resources/{metadata/queries/basic/dataset_with_meta-1/dataset_with_meta-1.2.update.aql => runtimets/queries_sqlpp/geojson/two-geometries/aggregate.17.query.sqlpp} (87%)
 create mode 100644 asterixdb/asterix-app/src/test/resources/runtimets/results/geojson/two-geometries/result.17.adm
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
 delete mode 100644 asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java

[asterixdb] 03/03: [NO ISSUE] Remove out-of-date tokenizer

Posted by im...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

imaxon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git

commit f85cee9e60c4a68bc2a7de87284c6b812794f76b
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Mon Apr 13 10:04:19 2020 -0700

    [NO ISSUE] Remove out-of-date tokenizer
    
    The string-based Tokenizer should be replaced with the array-based
    IBinaryTokenizer. The Tokenizer is not used in the codebase in a
    meaningful way, so let's remove it to make things clear.
    
    Change-Id: I483604bf2a5e20c18f6224ac2a153667828dabfb
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5763
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Ian Maxon <im...@uci.edu>
---
 .../asterix/fuzzyjoin/FuzzyJoinAppendLength.java   |  58 ---------
 .../asterix/fuzzyjoin/FuzzyJoinTokenize.java       | 133 ---------------------
 .../fuzzyjoin/similarity/SimilarityMetric.java     |   4 -
 .../similarity/SimilarityMetricJaccard.java        |  21 ----
 .../fuzzyjoin/tokenizer/NGramTokenizer.java        |  90 --------------
 .../apache/asterix/fuzzyjoin/tokenizer/Token.java  | 118 ------------------
 .../asterix/fuzzyjoin/tokenizer/Tokenizer.java     |  27 -----
 .../fuzzyjoin/tokenizer/TokenizerBuffered.java     |  30 -----
 .../tokenizer/TokenizerBufferedFactory.java        |  34 ------
 .../fuzzyjoin/tokenizer/TokenizerFactory.java      |  31 -----
 .../asterix/fuzzyjoin/tokenizer/WordTokenizer.java |  68 -----------
 .../fuzzyjoin/tokenizer/WordTokenizerBuffered.java |  92 --------------
 .../fuzzyjoin/tokenorder/IntTokenCountRank.java    |  28 -----
 .../tokenorder/IntTokenCountRankFrequency.java     |  58 ---------
 .../asterix/fuzzyjoin/tokenorder/IntTokenRank.java |  28 -----
 .../tokenorder/IntTokenRankFrequency.java          |  54 ---------
 .../asterix/fuzzyjoin/tokenorder/TokenLoad.java    |  61 ----------
 .../asterix/fuzzyjoin/tokenorder/TokenRank.java    |  31 -----
 .../tokenorder/TokenRankBufferedFrequency.java     |  75 ------------
 .../fuzzyjoin/tokenorder/TokenRankFrequency.java   |  61 ----------
 20 files changed, 1102 deletions(-)

diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
deleted file mode 100644
index 8be6f0c..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
-
-public class FuzzyJoinAppendLength {
-    public static void main(String args[]) throws IOException {
-        final String inputFileName = args[0];
-        final String outputFileName = args[1];
-
-        BufferedReader input = new BufferedReader(new FileReader(inputFileName));
-        BufferedWriter output = new BufferedWriter(new FileWriter(outputFileName));
-
-        Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
-                FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
-
-        int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
-
-        String line;
-        while ((line = input.readLine()) != null) {
-            String[] splits = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
-            Collection<String> tokens =
-                    tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
-            output.write(splits[0] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[1] + FuzzyJoinConfig.RECORD_SEPARATOR
-                    + splits[2] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[3] + FuzzyJoinConfig.RECORD_SEPARATOR
-                    + tokens.size() + "\n");
-        }
-
-        input.close();
-        output.close();
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java
deleted file mode 100644
index 4c85f25..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin;
-
-import java.io.BufferedOutputStream;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
-import org.apache.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
-import org.apache.asterix.fuzzyjoin.tokenorder.TokenLoad;
-import org.apache.asterix.fuzzyjoin.tokenorder.TokenRank;
-import org.apache.asterix.fuzzyjoin.tokenorder.TokenRankFrequency;
-
-public class FuzzyJoinTokenize {
-    public static class TokenCount implements Comparable<Object> {
-        public String token;
-        public MutableInteger count;
-
-        public TokenCount(String token, MutableInteger count) {
-            this.token = token;
-            this.count = count;
-        }
-
-        @Override
-        public int compareTo(Object o) {
-            TokenCount tc = (TokenCount) o;
-            return count.compareTo(tc.count);
-        }
-
-        public String getToken() {
-            return token;
-        }
-
-        @Override
-        public String toString() {
-            return token + " " + count;
-        }
-    }
-
-    public static void main(String args[]) throws IOException {
-        final String inputFileName = args[0];
-        final String tokensFileName = args[1];
-        final String tokenizedFileName = args[2];
-
-        BufferedReader input = new BufferedReader(new FileReader(inputFileName));
-
-        Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE,
-                FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
-
-        int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3");
-
-        String line;
-        HashMap<String, MutableInteger> tokenCount = new HashMap<String, MutableInteger>();
-        while ((line = input.readLine()) != null) {
-            Collection<String> tokens =
-                    tokenizer.tokenize(FuzzyJoinUtil.getData(line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX),
-                            dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
-
-            for (String token : tokens) {
-                MutableInteger count = tokenCount.get(token);
-                if (count == null) {
-                    tokenCount.put(token, new MutableInteger(1));
-                } else {
-                    count.inc();
-                }
-            }
-        }
-
-        input.close();
-
-        ArrayList<TokenCount> tokenCounts = new ArrayList<TokenCount>();
-        tokenCount.forEach((key, value) -> tokenCounts.add(new TokenCount(key, value)));
-        Collections.sort(tokenCounts);
-
-        BufferedWriter outputTokens = new BufferedWriter(new FileWriter(tokensFileName));
-        for (TokenCount tc : tokenCounts) {
-            outputTokens.write(tc.getToken() + "\n");
-        }
-        outputTokens.close();
-
-        TokenRank tokenRank = new TokenRankFrequency();
-        TokenLoad tokenLoad = new TokenLoad(tokensFileName, tokenRank);
-        tokenLoad.loadTokenRank();
-
-        input = new BufferedReader(new FileReader(inputFileName));
-        LittleEndianIntOutputStream outputTokenized =
-                new LittleEndianIntOutputStream(new BufferedOutputStream(new FileOutputStream(tokenizedFileName)));
-        while ((line = input.readLine()) != null) {
-            String splits[] = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
-            int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]);
-            outputTokenized.writeInt(rid);
-            Collection<String> tokens =
-                    tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR));
-            Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens);
-            outputTokenized.writeInt(tokensRanked.size());
-            for (Integer token : tokensRanked) {
-                outputTokenized.writeInt(token);
-            }
-            // for (int i = 0; i < tokens.size() - tokensRanked.size(); i++) {
-            // outputTokenized.writeInt(Integer.MAX_VALUE);
-            // }
-        }
-
-        input.close();
-        outputTokenized.close();
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index 3348d4c..1133246 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -19,7 +19,6 @@
 
 package org.apache.asterix.fuzzyjoin.similarity;
 
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 import org.apache.hyracks.data.std.util.ISequenceIterator;
 
@@ -118,7 +117,4 @@ public abstract class SimilarityMetric {
 
     public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
 
-    public abstract float getSimilarity(int[] tokensX, int[] tokensY);
-
-    public abstract float getSimilarity(String stringX, String stringY, Tokenizer tokenizer);
 }
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index 63d3077..f72400f 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -22,7 +22,6 @@ package org.apache.asterix.fuzzyjoin.similarity;
 import java.util.Set;
 import java.util.TreeSet;
 
-import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 import org.apache.hyracks.data.std.util.ISequenceIterator;
 
@@ -82,24 +81,4 @@ public class SimilarityMetricJaccard extends SimilarityMetric implements IGeneri
         return (float) intersectionSize / (totalSize - intersectionSize);
     }
 
-    @Override
-    public float getSimilarity(int[] tokensX, int[] tokensY) {
-        return getSimilarity(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length);
-    }
-
-    @Override
-    public float getSimilarity(String stringX, String stringY, Tokenizer tokenizer) {
-        Set<String> setX = new TreeSet<String>();
-        for (String token : tokenizer.tokenize(stringX)) {
-            setX.add(token);
-        }
-        Set<String> setY = new TreeSet<String>();
-        for (String token : tokenizer.tokenize(stringY)) {
-            setY.add(token);
-        }
-        int lengthX = setX.size();
-        int lengthY = setY.size();
-        setX.retainAll(setY);
-        return ((float) setX.size()) / (lengthX + lengthY - setX.size());
-    }
 }
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
deleted file mode 100644
index 5594e43..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-public class NGramTokenizer implements Tokenizer {
-
-    /**
-     *
-     */
-    private static final long serialVersionUID = 1L;
-
-    public static void main(String args[]) {
-        Tokenizer tokenizer = new NGramTokenizer();
-        String a = "hadoopoop";
-        System.out.println(a + ":" + tokenizer.tokenize(a));
-    }
-
-    private final int gramLength;
-
-    /**
-     * padding used in q gram calculation.
-     */
-    private final char QGRAMENDPADDING = '$';
-
-    /**
-     * padding used in q gram calculation.
-     */
-    private final char QGRAMSTARTPADDING = '$';
-
-    public NGramTokenizer() {
-        gramLength = 3;
-    }
-
-    public NGramTokenizer(int gramLength) {
-        this.gramLength = gramLength;
-    }
-
-    private StringBuffer getAdjustedString(String input) {
-        final StringBuffer adjustedString = new StringBuffer();
-        for (int i = 0; i < gramLength - 1; i++) {
-            adjustedString.append(QGRAMSTARTPADDING);
-        }
-        adjustedString.append(input);
-        for (int i = 0; i < gramLength - 1; i++) {
-            adjustedString.append(QGRAMENDPADDING);
-        }
-        return adjustedString;
-    }
-
-    public List<String> tokenize(String input) {
-        final ArrayList<String> returnVect = new ArrayList<String>();
-        final StringBuffer adjustedString = getAdjustedString(input);
-        int curPos = 0;
-        final int length = adjustedString.length() - (gramLength - 1);
-        final HashMap<String, Integer> grams = new HashMap<String, Integer>();
-        while (curPos < length) {
-            final String term = adjustedString.substring(curPos, curPos + gramLength);
-            Integer count = grams.get(term);
-            if (count == null) {
-                count = new Integer(0);
-            }
-            count++;
-            grams.put(term, count);
-            returnVect.add(term + count);
-            curPos++;
-        }
-        return returnVect;
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java
deleted file mode 100644
index 720d269..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-
-public class Token implements Serializable {
-    /**
-     *
-     */
-    private static final long serialVersionUID = 1L;
-
-    private CharSequence data;
-    private int start;
-    private int length;
-    private int count;
-
-    /** Cache the hash code for the string */
-    private int hash; // Default to 0
-
-    public Token() {
-    }
-
-    public Token(CharSequence data, int start, int length, int count) {
-        set(data, start, length, count);
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (o == null) {
-            return false;
-        }
-        if (!(o instanceof Token)) {
-            return false;
-        }
-        Token t = (Token) o;
-        if (t.length != length) {
-            return false;
-        }
-        for (int i = 0; i < length; i++) {
-            if (t.data.charAt(t.start + i) != data.charAt(start + i)) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    public CharSequence getCharSequence() {
-        return data;
-    }
-
-    public int getCount() {
-        return count;
-    }
-
-    public int getLength() {
-        return length;
-    }
-
-    public int getStart() {
-        return start;
-    }
-
-    @Override
-    public int hashCode() {
-        int h = hash;
-        if (h == 0 && length > 0) {
-            for (int i = 0; i < length; i++) {
-                h = 31 * h + data.charAt(start + i);
-            }
-            h = 31 * h + count;
-            hash = h;
-        }
-        return h;
-    }
-
-    public int length() {
-        return length;
-    }
-
-    public void set(CharSequence data, int start, int length, int count) {
-        this.data = data;
-        this.start = start;
-        this.length = length;
-        this.count = count;
-        hash = 0;
-    }
-
-    public void set(String data, int count) {
-        this.data = data;
-        start = 0;
-        length = data.length();
-        this.count = count;
-        hash = 0;
-    }
-
-    @Override
-    public String toString() {
-        return "(" + data.subSequence(start, start + length) + ", " + count + ")";
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java
deleted file mode 100644
index 71078d5..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.io.Serializable;
-import java.util.List;
-
-public interface Tokenizer extends Serializable {
-    public List<String> tokenize(String text);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
deleted file mode 100644
index 19fcf18..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public interface TokenizerBuffered {
-    public void advance();
-
-    public boolean end();
-
-    public Token getToken();
-
-    public void reset();
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
deleted file mode 100644
index 2f4e8c6..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class TokenizerBufferedFactory {
-    public static TokenizerBuffered getTokenizer(String tokenizer, StringBuilder buffer) {
-        if (tokenizer.equals("Word")) {
-            return new WordTokenizerBuffered(buffer);
-        }
-        throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
-    }
-
-    public static boolean isSeparator(char c) {
-        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER
-                || Character.getType(c) == Character.OTHER_NUMBER);
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
deleted file mode 100644
index 9b1856a..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-public class TokenizerFactory {
-    public static Tokenizer getTokenizer(String tokenizer, String wordSeparator, char tokenSeparator) {
-        if (tokenizer.equals("NGram")) {
-            return new NGramTokenizer();
-        } else if (tokenizer.equals("Word")) {
-            return new WordTokenizer(wordSeparator, tokenSeparator);
-        }
-        throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\".");
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
deleted file mode 100644
index fa0bfe7..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-public class WordTokenizer implements Tokenizer {
-
-    /**
-     *
-     */
-    private static final long serialVersionUID = 1L;
-
-    public static void main(String args[]) {
-        Tokenizer tokenizer = new WordTokenizer("_", '_');
-        String a = "hadoop_rocks_in_java";
-        System.out.println(a + ":" + tokenizer.tokenize(a));
-    }
-
-    private final String wordSeparator;
-    private final char tokenSeparator;
-
-    public WordTokenizer() {
-        this(" ", '_');
-    }
-
-    public WordTokenizer(String wordSeparator, char tokenSeparator) {
-        this.wordSeparator = wordSeparator;
-        this.tokenSeparator = tokenSeparator;
-    }
-
-    public List<String> tokenize(String input) {
-        final ArrayList<String> returnVect = new ArrayList<String>();
-        final HashMap<String, Integer> tokens = new HashMap<String, Integer>();
-        for (String term : input.split(wordSeparator)) {
-            if (term.length() == 0) {
-                continue;
-            }
-            Integer count = tokens.get(term);
-            if (count == null) {
-                count = 0;
-            }
-            count++;
-            tokens.put(term, count);
-            returnVect.add(term + tokenSeparator + count);
-        }
-        return returnVect;
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
deleted file mode 100644
index 29206f9..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenizer;
-
-import org.apache.asterix.fuzzyjoin.IntArray;
-
-public class WordTokenizerBuffered implements TokenizerBuffered {
-
-    private final StringBuilder buffer;
-    private int index;
-    private final Token token;
-
-    private final IntArray tokensStart, tokensLength;
-
-    public WordTokenizerBuffered(StringBuilder buffer) {
-        this.buffer = buffer;
-        token = new Token();
-        tokensStart = new IntArray();
-        tokensLength = new IntArray();
-        reset();
-    }
-
-    @Override
-    public void advance() {
-        while (index < buffer.length() && TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
-            index++;
-        }
-        int start = index;
-        while (index < buffer.length() && !TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) {
-            buffer.setCharAt(index, Character.toLowerCase(buffer.charAt(index)));
-            index++;
-        }
-        int length = index - start;
-        int count = 1;
-        if (length > 0) {
-            // search if we got the same token before
-            for (int i = 0; i < tokensStart.length(); ++i) {
-                if (length == tokensLength.get(i)) {
-                    int tokenStart = tokensStart.get(i);
-                    count++; // assume we found it
-                    for (int j = 0; j < length; ++j) {
-                        if (buffer.charAt(start + j) != buffer.charAt(tokenStart + j)) {
-                            count--; // token not found
-                            break;
-                        }
-                    }
-                }
-            }
-            // add the new token to the list of seen tokens
-            tokensStart.add(start);
-            tokensLength.add(length);
-        }
-        // set token
-        token.set(buffer, start, length, count);
-    }
-
-    @Override
-    public boolean end() {
-        return token.length() <= 0;
-    }
-
-    @Override
-    public Token getToken() {
-        return token;
-    }
-
-    @Override
-    public void reset() {
-        index = 0;
-        tokensStart.reset();
-        tokensLength.reset();
-        advance();
-    }
-
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
deleted file mode 100644
index 90f8c6a..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.Serializable;
-
-public interface IntTokenCountRank extends Serializable {
-    public int add(int token, int count);
-
-    public int getRank(int token, int count);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
deleted file mode 100644
index d54c7d6..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.HashMap;
-
-import org.apache.asterix.fuzzyjoin.IntPair;
-
-public class IntTokenCountRankFrequency implements IntTokenCountRank {
-    /**
-     *
-     */
-    private static final long serialVersionUID = 1L;
-
-    private final HashMap<IntPair, Integer> ranksMap = new HashMap<IntPair, Integer>();
-    private final IntPair tmpPair = new IntPair();
-    private int crtRank = 0;
-
-    @Override
-    public int add(int token, int count) {
-        int prevRank = crtRank;
-        ranksMap.put(new IntPair(token, count), prevRank);
-        crtRank++;
-        return prevRank;
-    }
-
-    @Override
-    public int getRank(int token, int count) {
-        tmpPair.set(token, count);
-        Integer rank = ranksMap.get(tmpPair);
-        if (rank == null) {
-            return -1;
-        }
-        return rank;
-    }
-
-    @Override
-    public String toString() {
-        return "[" + crtRank + ",\n " + ranksMap + "\n]";
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
deleted file mode 100644
index b8e2082..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.Serializable;
-
-public interface IntTokenRank extends Serializable {
-    public int add(int token);
-
-    public int getRank(int token);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
deleted file mode 100644
index 08d1c93..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.HashMap;
-
-public class IntTokenRankFrequency implements IntTokenRank {
-    /**
-     *
-     */
-    private static final long serialVersionUID = 1L;
-
-    private final HashMap<Integer, Integer> ranksMap = new HashMap<Integer, Integer>();
-    private int crtRank = 0;
-
-    @Override
-    public int add(int token) {
-        int prevRank = crtRank;
-        ranksMap.put(token, prevRank);
-        crtRank++;
-        return prevRank;
-    }
-
-    @Override
-    public int getRank(int token) {
-        Integer rank = ranksMap.get(token);
-        if (rank == null) {
-            return -1;
-        }
-        return rank;
-    }
-
-    @Override
-    public String toString() {
-        return "[" + crtRank + ",\n " + ranksMap + "\n]";
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java
deleted file mode 100644
index 3578d94..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Serializable;
-
-import org.apache.asterix.fuzzyjoin.FuzzyJoinConfig;
-
-public class TokenLoad implements Serializable {
-    private final String path;
-    private final TokenRank rank;
-
-    public TokenLoad(String path, TokenRank rank) {
-        this.path = path;
-        this.rank = rank;
-    }
-
-    public void loadTokenRank() {
-        loadTokenRank(1);
-    }
-
-    public void loadTokenRank(int factor) {
-        try (BufferedReader fis = new BufferedReader(
-                // new FileReader(path.toString())
-                new InputStreamReader(new FileInputStream(path), "UTF-8"))) {
-            String token = null;
-            while ((token = fis.readLine()) != null) {
-                rank.add(token);
-                // only used when increasing the token dictionary
-                for (int i = 1; i < factor; i++) {
-                    // remove _COUNT at the end of the token (it is removed in
-                    // the new records anyway)
-                    rank.add(token.split(FuzzyJoinConfig.TOKEN_SEPARATOR_REGEX)[0] + i);
-                }
-            }
-        } catch (IOException ioe) {
-            throw new RuntimeException(ioe);
-        }
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java
deleted file mode 100644
index 42cdfa7..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.io.Serializable;
-import java.util.Collection;
-
-public interface TokenRank extends Serializable {
-    public int add(String token);
-
-    public Integer getRank(String token);
-
-    public Collection<Integer> getTokenRanks(Iterable<String> tokens);
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
deleted file mode 100644
index 57fc325..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.Collection;
-import java.util.HashMap;
-
-import org.apache.asterix.fuzzyjoin.tokenizer.Token;
-
-public class TokenRankBufferedFrequency implements TokenRank {
-    /**
-     *
-     */
-    private static final long serialVersionUID = 1L;
-
-    private final HashMap<Token, Integer> ranksMap = new HashMap<Token, Integer>();
-    private int crtRank = 0;
-
-    public int add(String stringWithCount) {
-        int end = stringWithCount.lastIndexOf('_');
-        int count = 0;
-        for (int i = end + 1; i < stringWithCount.length(); ++i) {
-            count = count * 10 + (stringWithCount.charAt(i) - '0');
-        }
-        return add(stringWithCount.substring(0, end), count);
-    }
-
-    public int add(String string, int count) {
-        Token token = new Token(string, 0, string.length(), count);
-        return add(token);
-    }
-
-    public int add(Token token) {
-        int prevRank = crtRank;
-        ranksMap.put(token, prevRank);
-        crtRank++;
-        return prevRank;
-    }
-
-    @Override
-    public Integer getRank(String token) {
-        throw new UnsupportedOperationException();
-    }
-
-    public Integer getRank(Token token) {
-        return ranksMap.get(token);
-    }
-
-    @Override
-    public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public String toString() {
-        return "[" + crtRank + ",\n " + ranksMap + "\n]";
-    }
-}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
deleted file mode 100644
index 97b9503..0000000
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.asterix.fuzzyjoin.tokenorder;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.TreeSet;
-
-public class TokenRankFrequency implements TokenRank {
-    /**
-     *
-     */
-    private static final long serialVersionUID = 1L;
-
-    private final HashMap<String, Integer> ranksMap = new HashMap<String, Integer>();
-    private int crtRank = 0;
-
-    public int add(String token) {
-        int prevRank = crtRank;
-        ranksMap.put(token, prevRank);
-        crtRank++;
-        return prevRank;
-    }
-
-    public Integer getRank(String token) {
-        return ranksMap.get(token);
-    }
-
-    public Collection<Integer> getTokenRanks(Iterable<String> tokens) {
-        TreeSet<Integer> ranksCol = new TreeSet<Integer>();
-        for (String token : tokens) {
-            Integer rank = getRank(token);
-            if (rank != null) {
-                ranksCol.add(rank);
-            }
-        }
-        return ranksCol;
-    }
-
-    @Override
-    public String toString() {
-        return "[" + crtRank + ",\n " + ranksMap + "\n]";
-    }
-}

[asterixdb] 02/03: [NO ISSUE] Rename tokenizer factories

Posted by im...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

imaxon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git

commit 934ce9d903d36990d22febd69e08a81d0c8e40b2
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Tue Apr 14 12:17:13 2020 -0700

    [NO ISSUE] Rename tokenizer factories
    
    Rename the variables to make things clear. Previously, tokenizer
    factories were named tokenizer which could be confusing.
    
    Change-Id: Ie72d420e18509489d0fc0f9c98b162202a62be55
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5804
    Contrib: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Ian Maxon <im...@uci.edu>
---
 .../nontagged/BinaryTokenizerFactoryProvider.java  | 27 ++++++++++++----------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/BinaryTokenizerFactoryProvider.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/BinaryTokenizerFactoryProvider.java
index 5ef7702..6333e92 100644
--- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/BinaryTokenizerFactoryProvider.java
+++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/formats/nontagged/BinaryTokenizerFactoryProvider.java
@@ -30,26 +30,29 @@ import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8Strin
 import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
 import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory;
 
+// ToDo: maybe we can make the constructor method of the tokenizers private so that tokenizers have to be generated via this provider
+// Currently, different call sites of tokenizers use **different parameters**, and this can be error-prone
+// A centralized provider can avoid the bugs due to different parameters.
 public class BinaryTokenizerFactoryProvider implements IBinaryTokenizerFactoryProvider {
 
     public static final BinaryTokenizerFactoryProvider INSTANCE = new BinaryTokenizerFactoryProvider();
 
-    private static final IBinaryTokenizerFactory aqlStringTokenizer =
+    private static final IBinaryTokenizerFactory stringTokenizerFactory =
             new DelimitedUTF8StringBinaryTokenizerFactory(true, true,
                     new UTF8WordTokenFactory(ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_INT32_TYPE_TAG));
 
-    private static final IBinaryTokenizerFactory aqlStringNoTypeTagTokenizer =
+    private static final IBinaryTokenizerFactory stringNoTypeTagTokenizerFactory =
             new DelimitedUTF8StringBinaryTokenizerFactory(true, false,
                     new UTF8WordTokenFactory(ATypeTag.STRING.serialize(), ATypeTag.INTEGER.serialize()));
 
-    private static final IBinaryTokenizerFactory aqlHashingStringTokenizer =
+    private static final IBinaryTokenizerFactory stringHashingTokenizerFactory =
             new DelimitedUTF8StringBinaryTokenizerFactory(true, true, new HashedUTF8WordTokenFactory(
                     ATypeTag.SERIALIZED_INT32_TYPE_TAG, ATypeTag.SERIALIZED_INT32_TYPE_TAG));
 
-    private static final IBinaryTokenizerFactory orderedListTokenizer =
+    private static final IBinaryTokenizerFactory orderedListTokenizerFactory =
             new AOrderedListBinaryTokenizerFactory(new AListElementTokenFactory());
 
-    private static final IBinaryTokenizerFactory unorderedListTokenizer =
+    private static final IBinaryTokenizerFactory unorderedListTokenizerFactory =
             new AUnorderedListBinaryTokenizerFactory(new AListElementTokenFactory());
 
     @Override
@@ -58,16 +61,16 @@ public class BinaryTokenizerFactoryProvider implements IBinaryTokenizerFactoryPr
         switch (typeTag) {
             case STRING:
                 if (hashedTokens) {
-                    return aqlHashingStringTokenizer;
+                    return stringHashingTokenizerFactory;
                 } else if (!typeTageAlreadyRemoved) {
-                    return aqlStringTokenizer;
+                    return stringTokenizerFactory;
                 } else {
-                    return aqlStringNoTypeTagTokenizer;
+                    return stringNoTypeTagTokenizerFactory;
                 }
             case ARRAY:
-                return orderedListTokenizer;
+                return orderedListTokenizerFactory;
             case MULTISET:
-                return unorderedListTokenizer;
+                return unorderedListTokenizerFactory;
             default:
                 return null;
         }
@@ -86,9 +89,9 @@ public class BinaryTokenizerFactoryProvider implements IBinaryTokenizerFactoryPr
                                     ATypeTag.SERIALIZED_INT32_TYPE_TAG));
                 }
             case ARRAY:
-                return orderedListTokenizer;
+                return orderedListTokenizerFactory;
             case MULTISET:
-                return unorderedListTokenizer;
+                return unorderedListTokenizerFactory;
             default:
                 return null;
         }

[asterixdb] 01/03: [ASTERIXDB-2709][COMP] Fix registration of st_union aggregates

Posted by im...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

imaxon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git

commit 3d0897b6532b276d20fb7fc255f91de58002d7b8
Author: Dmitry Lychagin <dm...@couchbase.com>
AuthorDate: Fri Apr 10 11:20:49 2020 -0700

    [ASTERIXDB-2709][COMP] Fix registration of st_union aggregates
    
    - user model changes: no
    - storage format changes: no
    - interface changes: no
    
    Details:
    - Fix registration of st_union() aggregate function
      in BuiltinFunctions
    
    Change-Id: Iff0e141a5874b87916bd07335594fb6350ed93c7
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5704
    Contrib: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Dmitry Lychagin <dm...@couchbase.com>
    Reviewed-by: Ali Alsuliman <al...@gmail.com>
---
 .../two-geometries/aggregate.17.query.sqlpp        | 23 ++++++++++++++++++
 .../results/geojson/two-geometries/result.17.adm   |  1 +
 .../asterix/om/functions/BuiltinFunctions.java     | 28 ++++++++++++++++++----
 3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/geojson/two-geometries/aggregate.17.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/geojson/two-geometries/aggregate.17.query.sqlpp
new file mode 100644
index 0000000..578ba95
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/geojson/two-geometries/aggregate.17.query.sqlpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use GeoJSON1;
+
+strict_st_union((
+  SELECT VALUE gbu FROM [st_make_point(1.0,1.0),st_make_point(1.0,2.0)] as gbu
+));
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/geojson/two-geometries/result.17.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/geojson/two-geometries/result.17.adm
new file mode 100644
index 0000000..5123e09
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/geojson/two-geometries/result.17.adm
@@ -0,0 +1 @@
+{"type":"MultiPoint","coordinates":[[1,1],[1,2]],"crs":{"type":"name","properties":{"name":"EPSG:4326"}}}
\ No newline at end of file
diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/functions/BuiltinFunctions.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/functions/BuiltinFunctions.java
index 3570aff..19c1b51 100644
--- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/functions/BuiltinFunctions.java
+++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/functions/BuiltinFunctions.java
@@ -1326,13 +1326,22 @@ public class BuiltinFunctions {
     public static final FunctionIdentifier ST_LENGTH =
             new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st-length", 1);
     public static final FunctionIdentifier SCALAR_ST_UNION_AGG =
-            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st-union", 1);
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st_union", 1);
     public static final FunctionIdentifier SCALAR_ST_UNION_AGG_DISTINCT =
-            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st-union-distinct", 1);
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st_union-distinct", 1);
     public static final FunctionIdentifier ST_UNION_AGG =
-            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st-union-agg", 1);
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "agg-st_union", 1);
     public static final FunctionIdentifier ST_UNION_AGG_DISTINCT =
-            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st-union-agg-distinct", 1);
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "agg-st_union-distinct", 1);
+    public static final FunctionIdentifier SCALAR_ST_UNION_SQL_AGG =
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "sql-st_union", 1);
+    public static final FunctionIdentifier SCALAR_ST_UNION_SQL_AGG_DISTINCT =
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "sql-st_union-distinct", 1);
+    public static final FunctionIdentifier ST_UNION_SQL_AGG =
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "agg-sql-st_union", 1);
+    public static final FunctionIdentifier ST_UNION_SQL_AGG_DISTINCT =
+            new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "agg-sql-st_union-distinct", 1);
+
     public static final FunctionIdentifier ST_GEOM_FROM_TEXT =
             new FunctionIdentifier(FunctionConstants.ASTERIX_NS, "st-geom-from-text", 1);
     public static final FunctionIdentifier ST_GEOM_FROM_TEXT_SRID =
@@ -2207,7 +2216,10 @@ public class BuiltinFunctions {
         addFunction(ST_SYM_DIFFERENCE, AGeometryTypeComputer.INSTANCE, true);
         addFunction(SCALAR_ST_UNION_AGG, AGeometryTypeComputer.INSTANCE, true);
         addFunction(SCALAR_ST_UNION_AGG_DISTINCT, AGeometryTypeComputer.INSTANCE, true);
+        addFunction(SCALAR_ST_UNION_SQL_AGG, AGeometryTypeComputer.INSTANCE, true);
+        addFunction(SCALAR_ST_UNION_SQL_AGG_DISTINCT, AGeometryTypeComputer.INSTANCE, true);
         addPrivateFunction(ST_UNION_AGG, AGeometryTypeComputer.INSTANCE, true);
+        addPrivateFunction(ST_UNION_SQL_AGG, AGeometryTypeComputer.INSTANCE, true);
         addFunction(ST_POLYGONIZE, AGeometryTypeComputer.INSTANCE, true);
 
         // Binary functions
@@ -3011,6 +3023,14 @@ public class BuiltinFunctions {
         addScalarAgg(ST_UNION_AGG, SCALAR_ST_UNION_AGG);
         addDistinctAgg(ST_UNION_AGG_DISTINCT, ST_UNION_AGG);
         addScalarAgg(ST_UNION_AGG_DISTINCT, SCALAR_ST_UNION_AGG_DISTINCT);
+
+        addAgg(ST_UNION_SQL_AGG);
+        addLocalAgg(ST_UNION_SQL_AGG, ST_UNION_SQL_AGG);
+        addIntermediateAgg(ST_UNION_SQL_AGG, ST_UNION_SQL_AGG);
+        addGlobalAgg(ST_UNION_SQL_AGG, ST_UNION_SQL_AGG);
+        addScalarAgg(ST_UNION_SQL_AGG, SCALAR_ST_UNION_SQL_AGG);
+        addDistinctAgg(ST_UNION_SQL_AGG_DISTINCT, ST_UNION_SQL_AGG);
+        addScalarAgg(ST_UNION_SQL_AGG_DISTINCT, SCALAR_ST_UNION_SQL_AGG_DISTINCT);
     }
 
     interface BuiltinFunctionProperty {