You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@rya.apache.org by mi...@apache.org on 2015/12/07 13:05:06 UTC
[36/51] [partial] incubator-rya git commit: Cannot delete temp branch,
doc'd it.
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java
deleted file mode 100644
index f529569..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/AccumuloFreeTextIndexer.java
+++ /dev/null
@@ -1,611 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.getNodeIterator;
-import info.aduna.iteration.CloseableIteration;
-
-import java.io.IOException;
-import java.nio.charset.CharacterCodingException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import mvm.rya.accumulo.experimental.AbstractAccumuloIndexer;
-import mvm.rya.api.domain.RyaStatement;
-import mvm.rya.api.resolver.RyaToRdfConversions;
-import mvm.rya.indexing.FreeTextIndexer;
-import mvm.rya.indexing.StatementContraints;
-import mvm.rya.indexing.accumulo.ConfigUtils;
-import mvm.rya.indexing.accumulo.Md5Hash;
-import mvm.rya.indexing.accumulo.StatementSerializer;
-import mvm.rya.indexing.accumulo.freetext.iterators.BooleanTreeIterator;
-import mvm.rya.indexing.accumulo.freetext.query.ASTExpression;
-import mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils;
-import mvm.rya.indexing.accumulo.freetext.query.ASTSimpleNode;
-import mvm.rya.indexing.accumulo.freetext.query.ASTTerm;
-import mvm.rya.indexing.accumulo.freetext.query.ParseException;
-import mvm.rya.indexing.accumulo.freetext.query.QueryParser;
-import mvm.rya.indexing.accumulo.freetext.query.QueryParserTreeConstants;
-import mvm.rya.indexing.accumulo.freetext.query.SimpleNode;
-import mvm.rya.indexing.accumulo.freetext.query.TokenMgrError;
-
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
-import org.apache.accumulo.core.client.BatchWriter;
-import org.apache.accumulo.core.client.IteratorSetting;
-import org.apache.accumulo.core.client.MultiTableBatchWriter;
-import org.apache.accumulo.core.client.MutationsRejectedException;
-import org.apache.accumulo.core.client.Scanner;
-import org.apache.accumulo.core.client.TableExistsException;
-import org.apache.accumulo.core.client.TableNotFoundException;
-import org.apache.accumulo.core.client.admin.TableOperations;
-import org.apache.accumulo.core.data.Key;
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.data.Range;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.file.keyfunctor.ColumnFamilyFunctor;
-import org.apache.accumulo.core.iterators.user.IntersectingIterator;
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.lang.Validate;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.log4j.Logger;
-import org.openrdf.model.Literal;
-import org.openrdf.model.Statement;
-import org.openrdf.model.URI;
-import org.openrdf.query.QueryEvaluationException;
-
-import com.google.common.base.Charsets;
-
-/**
- * The {@link AccumuloFreeTextIndexer} stores and queries "free text" data from statements into tables in Accumulo. Specifically, this class
- * stores data into two different Accumulo Tables. This is the <b>document table</b> (default name: triplestore_text) and the <b>terms
- * table</b> (default name: triplestore_terms).
- * <p>
- * The document table stores the document (i.e. a triple statement), document properties, and the terms within the document. This is the
- * main table used for processing a text search by using document partitioned indexing. See {@link IntersectingIterator}.
- * <p>
- * For each document, the document table will store the following information:
- * <P>
- *
- * <pre>
- * Row (partition) | Column Family | Column Qualifier | Value
- * ================+================+==================+==========
- * shardID | d\x00 | documentHash | Document
- * shardID | s\x00Subject | documentHash | (empty)
- * shardID | p\x00Predicate | documentHash | (empty)
- * shardID | o\x00Object | documentHash | (empty)
- * shardID | c\x00Context | documentHash | (empty)
- * shardID | t\x00token | documentHash | (empty)
- * </pre>
- * <p>
- * Note: documentHash is a Base64-encoded MD5 hash of the Document's Content
- * <p>
- * The terms table is used for expanding wildcard search terms. For each token in the document table, the table will store the following
- * information:
- *
- * <pre>
- * Row (partition) | CF/CQ/Value
- * ==================+=============
- * l\x00token | (empty)
- * r\x00Reversetoken | (empty)
- * </pre>
- * <p>
- * There are two prefixes in the table, "token list" (keys with an "l" prefix) and "reverse token list" (keys with an "r" prefix). This table
- * uses the "token list" to expand foo* into terms like food, foot, and football. This table uses the "reverse token list" to expand *ar
- * into car, bar, and far.
- * <p>
- * Example: Given these three statements as inputs:
- *
- * <pre>
- * <uri:paul> rdfs:label "paul smith"@en <uri:graph1>
- * <uri:steve> rdfs:label "steven anthony miller"@en <uri:graph1>
- * <uri:steve> rdfs:label "steve miller"@en <uri:graph1>
- * </pre>
- * <p>
- * Here's what the tables would look like: (Note: the hashes aren't real, the rows are not sorted, and the partition ids will vary.)
- * <p>
- * Triplestore_text
- *
- * <pre>
- * Row (partition) | Column Family | Column Qualifier | Value
- * ================+=================================+==================+==========
- * 000000 | d\x00 | 08b3d233a | uri:graph1\x00uri:paul\x00rdfs:label\x00"paul smith"@en
- * 000000 | s\x00uri:paul | 08b3d233a | (empty)
- * 000000 | p\x00rdfs:label | 08b3d233a | (empty)
- * 000000 | o\x00"paul smith"@en | 08b3d233a | (empty)
- * 000000 | c\x00uri:graph1 | 08b3d233a | (empty)
- * 000000 | t\x00paul | 08b3d233a | (empty)
- * 000000 | t\x00smith | 08b3d233a | (empty)
- *
- * 000000 | d\x00 | 3a575534b | uri:graph1\x00uri:steve\x00rdfs:label\x00"steven anthony miller"@en
- * 000000 | s\x00uri:steve | 3a575534b | (empty)
- * 000000 | p\x00rdfs:label | 3a575534b | (empty)
- * 000000 | o\x00"steven anthony miller"@en | 3a575534b | (empty)
- * 000000 | c\x00uri:graph1 | 3a575534b | (empty)
- * 000000 | t\x00steven | 3a575534b | (empty)
- * 000000 | t\x00anthony | 3a575534b | (empty)
- * 000000 | t\x00miller | 3a575534b | (empty)
- *
- * 000001 | d\x00 | 7bf670d06 | uri:graph1\x00uri:steve\x00rdfs:label\x00"steve miller"@en
- * 000001 | s\x00uri:steve | 7bf670d06 | (empty)
- * 000001 | p\x00rdfs:label | 7bf670d06 | (empty)
- * 000001 | o\x00"steve miller"@en | 7bf670d06 | (empty)
- * 000001 | c\x00uri:graph1 | 7bf670d06 | (empty)
- * 000001 | t\x00steve | 7bf670d06 | (empty)
- * 000001 | t\x00miller | 7bf670d06 | (empty)
- * </pre>
- * <p>
- * triplestore_terms
- * <p>
- *
- * <pre>
- * Row (partition) | CF/CQ/Value
- * ==================+=============
- * l\x00paul | (empty)
- * l\x00smith | (empty)
- * l\x00steven | (empty)
- * l\x00anthony | (empty)
- * l\x00miller | (empty)
- * l\x00steve | (empty)
- * r\x00luap | (empty)
- * r\x00htims | (empty)
- * r\x00nevets | (empty)
- * r\x00ynohtna | (empty)
- * r\x00rellim | (empty)
- * r\x00evets | (empty)
- *
- * </pre>
- */
-public class AccumuloFreeTextIndexer extends AbstractAccumuloIndexer implements FreeTextIndexer {
- private static final Logger logger = Logger.getLogger(AccumuloFreeTextIndexer.class);
-
- private static final byte[] EMPTY_BYTES = new byte[] {};
- private static final Text EMPTY_TEXT = new Text(EMPTY_BYTES);
- private static final Value EMPTY_VALUE = new Value(EMPTY_BYTES);
-
- private Tokenizer tokenizer;
-
- private BatchWriter docTableBw;
- private BatchWriter termTableBw;
- private MultiTableBatchWriter mtbw;
-
- private int queryTermLimit;
-
- private int docTableNumPartitions;
-
- private Set<URI> validPredicates;
-
- private Configuration conf;
-
- private boolean isInit = false;
-
-
- private void init() throws AccumuloException, AccumuloSecurityException, TableNotFoundException,
- TableExistsException {
- String doctable = ConfigUtils.getFreeTextDocTablename(conf);
- String termtable = ConfigUtils.getFreeTextTermTablename(conf);
-
- docTableNumPartitions = ConfigUtils.getFreeTextDocNumPartitions(conf);
- int termTableNumPartitions = ConfigUtils.getFreeTextTermNumPartitions(conf);
-
- TableOperations tableOps = ConfigUtils.getConnector(conf).tableOperations();
-
- // Create term table partitions
- boolean createdTermTable = ConfigUtils.createTableIfNotExists(conf, termtable);
- if (createdTermTable && !ConfigUtils.useMockInstance(conf) && termTableNumPartitions > 0) {
- TreeSet<Text> splits = new TreeSet<Text>();
-
- // split on the "Term List" and "Reverse Term list" boundary
- splits.add(new Text(ColumnPrefixes.getRevTermListColFam("")));
-
- // Symmetrically split the "Term List" and "Reverse Term list"
- int numSubpartitions = ((termTableNumPartitions - 1) / 2);
- if (numSubpartitions > 0) {
- int step = (26 / numSubpartitions);
- for (int i = 0; i < numSubpartitions; i++) {
- String nextChar = String.valueOf((char) ('a' + (step * i)));
- splits.add(new Text(ColumnPrefixes.getTermListColFam(nextChar)));
- splits.add(new Text(ColumnPrefixes.getRevTermListColFam(nextChar)));
- }
- }
- tableOps.addSplits(termtable, splits);
- }
-
- // Create document (text) table partitions
- boolean createdDocTable = ConfigUtils.createTableIfNotExists(conf, doctable);
- if (createdDocTable && !ConfigUtils.useMockInstance(conf)) {
- TreeSet<Text> splits = new TreeSet<Text>();
- for (int i = 0; i < docTableNumPartitions; i++) {
- splits.add(genPartition(i, docTableNumPartitions));
- }
- tableOps.addSplits(doctable, splits);
-
- // Add a tablet level Bloom filter for the Column Family.
- // This will allow us to quickly determine if a term is contained in a tablet.
- tableOps.setProperty(doctable, "table.bloom.key.functor", ColumnFamilyFunctor.class.getCanonicalName());
- tableOps.setProperty(doctable, "table.bloom.enabled", Boolean.TRUE.toString());
- }
-
- mtbw = ConfigUtils.createMultitableBatchWriter(conf);
-
- docTableBw = mtbw.getBatchWriter(doctable);
- termTableBw = mtbw.getBatchWriter(termtable);
-
- tokenizer = ConfigUtils.getFreeTextTokenizer(conf);
- validPredicates = ConfigUtils.getFreeTextPredicates(conf);
-
- queryTermLimit = ConfigUtils.getFreeTextTermLimit(conf);
- }
-
-
- //initialization occurs in setConf because index is created using reflection
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- if (!isInit) {
- try {
- init();
- isInit = true;
- } catch (AccumuloException e) {
- logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e);
- throw new RuntimeException(e);
- } catch (AccumuloSecurityException e) {
- logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e);
- throw new RuntimeException(e);
- } catch (TableNotFoundException e) {
- logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e);
- throw new RuntimeException(e);
- } catch (TableExistsException e) {
- logger.warn("Unable to initialize index. Throwing Runtime Exception. ", e);
- throw new RuntimeException(e);
- }
- }
- }
-
- @Override
- public Configuration getConf() {
- return this.conf;
- }
-
-
- private void storeStatement(Statement statement) throws IOException {
- // if the predicate list is empty, accept all predicates.
- // Otherwise, make sure the predicate is on the "valid" list
- boolean isValidPredicate = validPredicates.isEmpty() || validPredicates.contains(statement.getPredicate());
-
- if (isValidPredicate && (statement.getObject() instanceof Literal)) {
-
- // Get the tokens
- String text = statement.getObject().stringValue().toLowerCase();
- SortedSet<String> tokens = tokenizer.tokenize(text);
-
- if (!tokens.isEmpty()) {
- // Get Document Data
- String docContent = StatementSerializer.writeStatement(statement);
-
- String docId = Md5Hash.md5Base64(docContent);
-
- // Setup partition
- Text partition = genPartition(docContent.hashCode(), docTableNumPartitions);
-
- Mutation docTableMut = new Mutation(partition);
- List<Mutation> termTableMutations = new ArrayList<Mutation>();
-
- Text docIdText = new Text(docId);
-
- // Store the Document Data
- docTableMut.put(ColumnPrefixes.DOCS_CF_PREFIX, docIdText, new Value(docContent.getBytes(Charsets.UTF_8)));
-
- // index the statement parts
- docTableMut.put(ColumnPrefixes.getSubjColFam(statement), docIdText, EMPTY_VALUE);
- docTableMut.put(ColumnPrefixes.getPredColFam(statement), docIdText, EMPTY_VALUE);
- docTableMut.put(ColumnPrefixes.getObjColFam(statement), docIdText, EMPTY_VALUE);
- docTableMut.put(ColumnPrefixes.getContextColFam(statement), docIdText, EMPTY_VALUE);
-
- // index the statement terms
- for (String token : tokens) {
- // tie the token to the document
- docTableMut.put(ColumnPrefixes.getTermColFam(token), docIdText, EMPTY_VALUE);
-
- // store the term in the term table (useful for wildcard searches)
- termTableMutations.add(createEmptyPutMutation(ColumnPrefixes.getTermListColFam(token)));
- termTableMutations.add(createEmptyPutMutation(ColumnPrefixes.getRevTermListColFam(token)));
- }
-
- // write the mutations
- try {
- docTableBw.addMutation(docTableMut);
- termTableBw.addMutations(termTableMutations);
- } catch (MutationsRejectedException e) {
- logger.error("error adding mutation", e);
- throw new IOException(e);
- }
-
- }
-
- }
- }
-
- @Override
- public void storeStatement(RyaStatement statement) throws IOException {
- storeStatement(RyaToRdfConversions.convertStatement(statement));
- }
-
- private static Mutation createEmptyPutMutation(Text row) {
- Mutation m = new Mutation(row);
- m.put(EMPTY_TEXT, EMPTY_TEXT, EMPTY_VALUE);
- return m;
- }
-
- private static Text genPartition(int partition, int numParitions) {
- int length = Integer.toString(numParitions).length();
- return new Text(String.format("%0" + length + "d", Math.abs(partition % numParitions)));
- }
-
- @Override
- public Set<URI> getIndexablePredicates() {
- return validPredicates;
- }
-
- /** {@inheritDoc} */
- @Override
- public void flush() throws IOException {
- try {
- mtbw.flush();
- } catch (MutationsRejectedException e) {
- logger.error("error flushing the batch writer", e);
- throw new IOException(e);
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public void close() throws IOException {
- try {
- mtbw.close();
- } catch (MutationsRejectedException e) {
- logger.error("error closing the batch writer", e);
- throw new IOException(e);
- }
- }
-
- private Set<String> unrollWildcard(String string, boolean reverse) throws IOException {
- Scanner termTableScan = getScanner(ConfigUtils.getFreeTextTermTablename(conf));
-
- Set<String> unrolledTerms = new HashSet<String>();
-
- Text queryTerm;
- if (reverse) {
- String t = StringUtils.removeStart(string, "*").toLowerCase();
- queryTerm = ColumnPrefixes.getRevTermListColFam(t);
- } else {
- String t = StringUtils.removeEnd(string, "*").toLowerCase();
- queryTerm = ColumnPrefixes.getTermListColFam(t);
- }
-
- // perform query and read results
- termTableScan.setRange(Range.prefix(queryTerm));
-
- for (Entry<Key, Value> e : termTableScan) {
- String term = ColumnPrefixes.removePrefix(e.getKey().getRow()).toString();
- if (reverse) {
- unrolledTerms.add(StringUtils.reverse(term));
- } else {
- unrolledTerms.add(term);
- }
- }
-
- if (unrolledTerms.isEmpty()) {
- // put in a placeholder term that will never be in the index.
- unrolledTerms.add("\1\1\1");
- }
-
- return unrolledTerms;
- }
-
- private void unrollWildcards(SimpleNode node) throws IOException {
- if (node instanceof ASTExpression || node instanceof ASTSimpleNode) {
- for (SimpleNode n : getNodeIterator(node)) {
- unrollWildcards(n);
- }
- } else if (node instanceof ASTTerm) {
- ASTTerm term = (ASTTerm) node;
- boolean isWildTerm = term.getType().equals(ASTTerm.WILDTERM);
- boolean isPreWildTerm = term.getType().equals(ASTTerm.PREFIXTERM);
- if (isWildTerm || isPreWildTerm) {
- Set<String> unrolledTerms = unrollWildcard(term.getTerm(), isPreWildTerm);
-
- // create a new expression
- ASTExpression newExpression = new ASTExpression(QueryParserTreeConstants.JJTEXPRESSION);
- newExpression.setType(ASTExpression.OR);
- newExpression.setNotFlag(term.isNotFlag());
-
- for (String unrolledTerm : unrolledTerms) {
- ASTTerm t = new ASTTerm(QueryParserTreeConstants.JJTTERM);
- t.setNotFlag(false);
- t.setTerm(unrolledTerm);
- t.setType(ASTTerm.TERM);
- ASTNodeUtils.pushChild(newExpression, t);
- }
-
- // replace "term" node with "expression" node in "term" node parent
- SimpleNode parent = (SimpleNode) term.jjtGetParent();
- int index = ASTNodeUtils.getChildIndex(parent, term);
-
- Validate.isTrue(index >= 0, "child not found in parent");
-
- parent.jjtAddChild(newExpression, index);
- }
-
- } else {
- throw new IllegalArgumentException("Node is of unknown type: " + node.getClass().getName());
- }
- }
-
- private Scanner getScanner(String tablename) throws IOException {
- try {
- return ConfigUtils.createScanner(tablename, conf);
- } catch (AccumuloException e) {
- logger.error("Error connecting to " + tablename);
- throw new IOException(e);
- } catch (AccumuloSecurityException e) {
- logger.error("Error connecting to " + tablename);
- throw new IOException(e);
- } catch (TableNotFoundException e) {
- logger.error("Error connecting to " + tablename);
- throw new IOException(e);
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public CloseableIteration<Statement, QueryEvaluationException> queryText(String query, StatementContraints contraints)
- throws IOException {
- Scanner docTableScan = getScanner(ConfigUtils.getFreeTextDocTablename(conf));
-
- // test the query to see if it parses correctly.
- SimpleNode root = parseQuery(query);
-
- // unroll any wildcard nodes before it goes to the server
- unrollWildcards(root);
-
- String unrolledQuery = ASTNodeUtils.serializeExpression(root);
-
- // Add S P O C constraints to query
- StringBuilder constrainedQuery = new StringBuilder("(" + unrolledQuery + ")");
-
- if (contraints.hasSubject()) {
- constrainedQuery.append(" AND ");
- constrainedQuery.append(ColumnPrefixes.getSubjColFam(contraints.getSubject().toString()).toString());
- }
- if (contraints.hasContext()) {
- constrainedQuery.append(" AND ");
- constrainedQuery.append(ColumnPrefixes.getContextColFam(contraints.getContext().toString()).toString());
- }
- if (contraints.hasPredicates()) {
- constrainedQuery.append(" AND (");
- List<String> predicates = new ArrayList<String>();
- for (URI u : contraints.getPredicates()) {
- predicates.add(ColumnPrefixes.getPredColFam(u.stringValue()).toString());
- }
- constrainedQuery.append(StringUtils.join(predicates, " OR "));
- constrainedQuery.append(")");
- }
-
- // Verify that the query is a reasonable size
- root = parseQuery(constrainedQuery.toString());
- int termCount = ASTNodeUtils.termCount(root);
-
- if (termCount > queryTermLimit) {
- throw new IOException("Query contains too many terms. Term limit: " + queryTermLimit + ". Term Count: " + termCount);
- }
-
- // perform query
- docTableScan.clearScanIterators();
- docTableScan.clearColumns();
-
- int iteratorPriority = 20;
- String iteratorName = "booleanTree";
- IteratorSetting ii = new IteratorSetting(iteratorPriority, iteratorName, BooleanTreeIterator.class);
- BooleanTreeIterator.setQuery(ii, constrainedQuery.toString());
- docTableScan.addScanIterator(ii);
- docTableScan.setRange(new Range());
-
- return getIteratorWrapper(docTableScan);
- }
-
- private static CloseableIteration<Statement, QueryEvaluationException> getIteratorWrapper(final Scanner s) {
-
- final Iterator<Entry<Key, Value>> i = s.iterator();
-
- return new CloseableIteration<Statement, QueryEvaluationException>() {
- @Override
- public boolean hasNext() {
- return i.hasNext();
- }
-
- @Override
- public Statement next() throws QueryEvaluationException {
- Entry<Key, Value> entry = i.next();
- Value v = entry.getValue();
- try {
- String dataString = Text.decode(v.get(), 0, v.getSize());
- Statement s = StatementSerializer.readStatement(dataString);
- return s;
- } catch (CharacterCodingException e) {
- logger.error("Error decoding value", e);
- throw new QueryEvaluationException(e);
- } catch (IOException e) {
- logger.error("Error deserializing statement", e);
- throw new QueryEvaluationException(e);
- }
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException("Remove not implemented");
- }
-
- @Override
- public void close() throws QueryEvaluationException {
- s.close();
- }
- };
- }
-
- /**
- * Simple adapter that parses the query using {@link QueryParser}. Note: any checked exceptions thrown by {@link QueryParser} are
- * re-thrown as {@link IOException}s.
- *
- * @param query
- * @return
- * @throws IOException
- */
- private static SimpleNode parseQuery(String query) throws IOException {
- SimpleNode root = null;
- try {
- root = QueryParser.parse(query);
- } catch (ParseException e) {
- logger.error("Parser Exception on Client Side. Query: " + query, e);
- throw new IOException(e);
- } catch (TokenMgrError e) {
- logger.error("Token Manager Exception on Client Side. Query: " + query, e);
- throw new IOException(e);
- }
- return root;
- }
-
-
- @Override
- public String getTableName() {
- return ConfigUtils.getFreeTextDocTablename(conf);
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java
deleted file mode 100644
index 31666c9..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/ColumnPrefixes.java
+++ /dev/null
@@ -1,120 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-
-import mvm.rya.indexing.accumulo.StatementSerializer;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.io.Text;
-import org.openrdf.model.Statement;
-
-/**
- * Row ID: shardId
- * <p>
- * CF: CF Prefix + Term
- */
-public class ColumnPrefixes {
- public static final Text DOCS_CF_PREFIX = new Text("d\0");
- public static final Text TERM_CF_PREFIX = new Text("t\0");
- public static final Text TERM_LIST_CF_PREFIX = new Text("l\0");
- public static final Text REVERSE_TERM_LIST_CF_PREFIX = new Text("r\0");
-
- public static final Text SUBJECT_CF_PREFIX = new Text("s\0");
- public static final Text PREDICATE_CF_PREFIX = new Text("p\0");
- public static final Text OBJECT_CF_PREFIX = new Text("o\0");
- public static final Text CONTEXT_CF_PREFIX = new Text("c\0");
-
- private static Text concat(Text prefix, String str) {
- Text temp = new Text(prefix);
-
- try {
- ByteBuffer buffer = Text.encode(str, false);
- temp.append(buffer.array(), 0, buffer.limit());
- } catch (CharacterCodingException cce) {
- throw new IllegalArgumentException(cce);
- }
-
- return temp;
- }
-
- public static Text getTermColFam(String term) {
- return concat(TERM_CF_PREFIX, term);
- }
-
- public static Text getTermListColFam(String term) {
- return concat(TERM_LIST_CF_PREFIX, term);
- }
-
- public static Text getRevTermListColFam(String term) {
- return concat(REVERSE_TERM_LIST_CF_PREFIX, StringUtils.reverse(term));
- }
-
- public static Text getDocColFam(String term) {
- return concat(DOCS_CF_PREFIX, term);
- }
-
- public static Text getSubjColFam(String term) {
- return concat(SUBJECT_CF_PREFIX, term);
- }
-
- public static Text getSubjColFam(Statement statement) {
- String subj = StatementSerializer.writeSubject(statement);
- return getSubjColFam(subj);
- }
-
- public static Text getPredColFam(String term) {
- return concat(PREDICATE_CF_PREFIX, term);
- }
-
- public static Text getPredColFam(Statement statement) {
- String pred = StatementSerializer.writePredicate(statement);
- return getPredColFam(pred);
- }
-
- public static Text getObjColFam(String term) {
- return concat(OBJECT_CF_PREFIX, term);
- }
-
- public static Text getObjColFam(Statement statement) {
- String obj = StatementSerializer.writeObject(statement);
- return getObjColFam(obj);
- }
-
- public static Text getContextColFam(String term) {
- return concat(CONTEXT_CF_PREFIX, term);
- }
-
- public static Text getContextColFam(Statement statement) {
- String cont = StatementSerializer.writeContext(statement);
- return getContextColFam(cont);
- }
-
- public static Text removePrefix(Text termWithPrefix) {
- Text temp = new Text();
- temp.set(termWithPrefix.getBytes(), 2, termWithPrefix.getLength() - 2);
- return temp;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java
deleted file mode 100644
index 471870b..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/FreeTextTupleSet.java
+++ /dev/null
@@ -1,160 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-import info.aduna.iteration.CloseableIteration;
-
-import java.io.IOException;
-import java.util.Set;
-
-import mvm.rya.indexing.FreeTextIndexer;
-import mvm.rya.indexing.IndexingExpr;
-import mvm.rya.indexing.IteratorFactory;
-import mvm.rya.indexing.SearchFunction;
-import mvm.rya.indexing.StatementContraints;
-import mvm.rya.indexing.external.tupleSet.ExternalTupleSet;
-
-import org.apache.hadoop.conf.Configuration;
-import org.openrdf.model.Statement;
-import org.openrdf.model.URI;
-import org.openrdf.query.BindingSet;
-import org.openrdf.query.QueryEvaluationException;
-import org.openrdf.query.algebra.QueryModelVisitor;
-
-import com.google.common.base.Joiner;
-
-
-//Indexing Node for freetext expressions to be inserted into execution plan
-//to delegate freetext portion of query to free text index
-public class FreeTextTupleSet extends ExternalTupleSet {
-
- private Configuration conf;
- private FreeTextIndexer freeTextIndexer;
- private IndexingExpr filterInfo;
-
-
- public FreeTextTupleSet(IndexingExpr filterInfo, FreeTextIndexer freeTextIndexer) {
- this.filterInfo = filterInfo;
- this.freeTextIndexer = freeTextIndexer;
- this.conf = freeTextIndexer.getConf();
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public Set<String> getBindingNames() {
- return filterInfo.getBindingNames();
- }
-
- /**
- * {@inheritDoc}
- * <p>
- * Note that we need a deep copy for everything that (during optimizations)
- * can be altered via {@link #visitChildren(QueryModelVisitor)}
- */
- public FreeTextTupleSet clone() {
- return new FreeTextTupleSet(filterInfo, freeTextIndexer);
- }
-
- @Override
- public double cardinality() {
- return 0.0; // No idea how to estimate cardinality here.
- }
-
-
-
-
- @Override
- public String getSignature() {
-
- return "(FreeTextTuple Projection) " + "variables: " + Joiner.on(", ").join(this.getBindingNames()).replaceAll("\\s+", " ");
- }
-
-
-
- @Override
- public boolean equals(Object other) {
- if (other == this) {
- return true;
- }
- if (!(other instanceof FreeTextTupleSet)) {
- return false;
- }
-
- FreeTextTupleSet arg = (FreeTextTupleSet) other;
- return this.filterInfo.equals(arg.filterInfo);
- }
-
-
- @Override
- public int hashCode() {
- int result = 17;
- result = 31*result + filterInfo.hashCode();
-
- return result;
- }
-
-
-
- /**
- * Returns an iterator over the result set of the contained {@link IndexingExpr}.
- * <p>
- * Should be thread-safe: concurrent invocation of this method (see {@link OfflineIterable})
- * can be expected with some query evaluators.
- */
- @Override
- public CloseableIteration<BindingSet, QueryEvaluationException> evaluate(BindingSet bindings)
- throws QueryEvaluationException {
-
-
- URI funcURI = filterInfo.getFunction();
-
- SearchFunction searchFunction = new SearchFunction() {
-
- @Override
- public CloseableIteration<Statement, QueryEvaluationException> performSearch(String queryText,
- StatementContraints contraints) throws QueryEvaluationException {
- try {
- CloseableIteration<Statement, QueryEvaluationException> statements = freeTextIndexer.queryText(
- queryText, contraints);
- return statements;
- } catch (IOException e) {
- throw new QueryEvaluationException(e);
- }
- }
-
- @Override
- public String toString() {
- return "TEXT";
- };
- };
-
- if (filterInfo.getArguments().length > 1) {
- throw new IllegalArgumentException("Index functions do not support more than two arguments.");
- }
-
- String queryText = filterInfo.getArguments()[0].stringValue();
-
- return IteratorFactory.getIterator(filterInfo.getSpConstraint(), bindings, queryText, searchFunction);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java
deleted file mode 100644
index abda04a..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/LuceneTokenizer.java
+++ /dev/null
@@ -1,57 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
-
-/**
- * A {@link Tokenizer} that delegates to Lucene functions.
- * <p>
- * Token boundaries and normalization are whatever the shared {@link StandardAnalyzer}
- * (configured for {@code Version.LUCENE_36}) produces.
- */
-public class LuceneTokenizer implements Tokenizer {
-    private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
-
-    /**
-     * Runs the given text through the analyzer and collects the distinct terms.
-     *
-     * @param string
-     *            the text to tokenize
-     * @return the distinct terms emitted by the analyzer, in natural {@link String} order
-     * @throws RuntimeException
-     *             wrapping any {@link IOException} reported by the token stream
-     */
-    @Override
-    public SortedSet<String> tokenize(String string) {
-        SortedSet<String> set = new TreeSet<String>();
-        try {
-            TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
-            try {
-                // The attribute instance is reused for every token, so it only needs to be looked up once.
-                CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
-                stream.reset();
-                while (stream.incrementToken()) {
-                    set.add(term.toString());
-                }
-                // Finish the consumer workflow before releasing the stream.
-                stream.end();
-            } finally {
-                // Always release the stream, even when tokenization fails part-way through.
-                stream.close();
-            }
-        } catch (IOException e) {
-            // not thrown b/c we're using a string reader...
-            throw new RuntimeException(e);
-        }
-
-        return set;
-    }
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java
deleted file mode 100644
index e98e676..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/SimpleTokenizer.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-/**
- * A {@link Tokenizer} that splits on whitespace.
- */
-public class SimpleTokenizer implements Tokenizer {
-
-    /**
-     * Splits the input on runs of whitespace, trims and lower-cases every piece, and returns the
-     * non-empty results.
-     *
-     * @param sting
-     *            the text to tokenize; must not be {@code null}
-     * @return the distinct lower-cased tokens in natural {@link String} order
-     */
-    @Override
-    public SortedSet<String> tokenize(String sting) {
-        SortedSet<String> set = new TreeSet<String>();
-        for (String token : sting.split("\\s+")) {
-            // Lower-case with a fixed locale so the same text yields the same tokens on every JVM,
-            // regardless of the default locale (e.g. Turkish maps 'I' to a dotless 'i').
-            String t = token.trim().toLowerCase(java.util.Locale.ROOT);
-            if (!t.isEmpty()) {
-                set.add(t);
-            }
-        }
-        return set;
-    }
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java
deleted file mode 100644
index 24b40cd..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/Tokenizer.java
+++ /dev/null
@@ -1,31 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import java.util.SortedSet;
-
-/**
- * A utility that splits a string into tokens.
- * <p>
- * Tokens are returned as a {@link SortedSet}, so the result is ordered and free of duplicates;
- * token positions and repetition counts are not retained.
- */
-public interface Tokenizer {
- /**
-  * Splits the given text into tokens.
-  *
-  * @param sting
-  *            the text to tokenize
-  * @return the distinct tokens found in the text
-  */
- public SortedSet<String> tokenize(String sting);
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java
deleted file mode 100644
index 355fe14..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/AndingIterator.java
+++ /dev/null
@@ -1,563 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext.iterators;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Map;
-
-import org.apache.accumulo.core.client.IteratorSetting;
-import org.apache.accumulo.core.data.ArrayByteSequence;
-import org.apache.accumulo.core.data.ByteSequence;
-import org.apache.accumulo.core.data.Key;
-import org.apache.accumulo.core.data.PartialKey;
-import org.apache.accumulo.core.data.Range;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.iterators.IteratorEnvironment;
-import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
-import org.apache.accumulo.core.iterators.user.IntersectingIterator;
-import org.apache.accumulo.core.util.TextUtil;
-import org.apache.commons.codec.binary.Base64;
-import org.apache.hadoop.io.Text;
-import org.apache.log4j.Logger;
-
-/**
- * Adapted from {@link IntersectingIterator} with very slight modifications. Specifically, the comparator on the TermSource internal class was
- * modified to handle exhausted iterators and multiple rows per tablet server.
- */
-public class AndingIterator implements SortedKeyValueIterator<Key, Value> {
-
- // Empty Text substituted wherever a null term would otherwise be placed into a Key.
- protected Text nullText = new Text();
-
- /** Returns the partition of a key, which this iterator reads from the row. */
- protected Text getPartition(Key key) {
- return key.getRow();
- }
-
- /** Returns the term of a key, which this iterator reads from the column family. */
- protected Text getTerm(Key key) {
- return key.getColumnFamily();
- }
-
- /** Returns the document id of a key, which this iterator reads from the column qualifier. */
- protected Text getDocID(Key key) {
- return key.getColumnQualifier();
- }
-
- /** Builds a (row, column family) key; a null term is replaced by {@link #nullText}. */
- protected Key buildKey(Text partition, Text term) {
- return new Key(partition, (term == null) ? nullText : term);
- }
-
- /** Builds a (row, column family, column qualifier) key; a null term is replaced by {@link #nullText}. */
- protected Key buildKey(Text partition, Text term, Text docID) {
- return new Key(partition, (term == null) ? nullText : term, docID);
- }
-
- /** Returns the key used to seek past the partition of {@code key}; delegates to {@code followingKey(PartialKey.ROW)}. */
- protected Key buildFollowingPartitionKey(Key key) {
- return key.followingKey(PartialKey.ROW);
- }
-
- protected static final Logger log = Logger.getLogger(AndingIterator.class);
-
- /**
-  * Couples one query term with the iterator that scans it.
-  * <p>
-  * {@code seekColfams} holds the column families handed to {@code iter} when seeking: the term itself,
-  * or an empty list when the term is {@code null}. {@code notFlag} marks a negated term.
-  */
- protected static class TermSource {
-     public SortedKeyValueIterator<Key, Value> iter;
-     public Text term;
-     public Collection<ByteSequence> seekColfams;
-     public boolean notFlag;
-
-     /**
-      * Shallow copy: the new instance shares the iterator, term and column family collection of {@code other}.
-      */
-     public TermSource(TermSource other) {
-         this.iter = other.iter;
-         this.term = other.term;
-         this.notFlag = other.notFlag;
-         this.seekColfams = other.seekColfams;
-     }
-
-     /**
-      * Creates a source for a term that is not negated.
-      */
-     public TermSource(SortedKeyValueIterator<Key, Value> iter, Text term) {
-         this(iter, term, false);
-     }
-
-     /**
-      * @param iter
-      *            the iterator scanning this term
-      * @param term
-      *            the term (column family) to look for; may be {@code null}
-      * @param notFlag
-      *            {@code true} if the term is negated
-      */
-     public TermSource(SortedKeyValueIterator<Key, Value> iter, Text term, boolean notFlag) {
-         this.iter = iter;
-         this.term = term;
-         this.notFlag = notFlag;
-         // The desired column families for this source is the term itself;
-         // a null term means no column family restriction is recorded.
-         if (term == null) {
-             this.seekColfams = Collections.<ByteSequence> emptyList();
-         } else {
-             this.seekColfams = Collections.<ByteSequence> singletonList(new ArrayByteSequence(term.getBytes(), 0, term.getLength()));
-         }
-     }
-
-     /**
-      * @return the term as a string, or the placeholder {@code "Iterator"} when there is no term
-      */
-     public String getTermString() {
-         // A plain literal suffices; wrapping it in new String(...) only allocated a redundant copy.
-         return (this.term == null) ? "Iterator" : this.term.toString();
-     }
- }
-
- // One entry per term; index 0 is the source that next() advances.
- TermSource[] sources;
- // Number of populated entries in sources.
- int sourcesCount = 0;
-
- // Copy of the range passed to the most recent seek(); its end key bounds the scan in seekOneSource().
- Range overallRange;
-
- // query-time settings
- // Cursor position: currentPartition == null means there is no top key (see hasTop()).
- protected Text currentPartition = null;
- protected Text currentDocID = new Text(emptyByteArray);
- static final byte[] emptyByteArray = new byte[0];
-
- // Key exposed through getTopKey(); rebuilt by advanceToIntersection().
- protected Key topKey = null;
- // Constant empty value returned by getTopValue().
- protected Value value = new Value(emptyByteArray);
-
- /** Creates an iterator with no sources; they are supplied later by init() or addSource(). */
- public AndingIterator() {
- }
-
- /**
-  * Creates a copy of this iterator whose sources are deep copies of this iterator's sources.
-  * <p>
-  * Only the source configuration is copied; the cursor state (range, current partition and
-  * document id, top key) is not, so the copy has no top key until it is seeked.
-  *
-  * @param env
-  *            the environment handed to each source's own {@code deepCopy}
-  * @return the new, unseeked iterator
-  */
- @Override
- public SortedKeyValueIterator<Key, Value> deepCopy(IteratorEnvironment env) {
-     return new AndingIterator(this, env);
- }
-
- /**
-  * Copy constructor backing {@link #deepCopy(IteratorEnvironment)}.
-  */
- private AndingIterator(AndingIterator other, IteratorEnvironment env) {
-     if (other.sources != null) {
-         sourcesCount = other.sourcesCount;
-         sources = new TermSource[sourcesCount];
-         for (int i = 0; i < sourcesCount; i++) {
-             TermSource original = other.sources[i];
-             // Carry the NOT flag over: without it a negated term in the copy would be
-             // treated as a required term and the copy would return different results.
-             sources[i] = new TermSource(original.iter.deepCopy(env), original.term, original.notFlag);
-         }
-     }
- }
-
- /** Returns the key built by the last call to advanceToIntersection(), or null when there is no match. */
- @Override
- public Key getTopKey() {
- return topKey;
- }
-
- /** Returns the shared empty value; this iterator does not carry values from its sources. */
- @Override
- public Value getTopValue() {
- // we don't really care about values
- return value;
- }
-
- /** Reports whether a match is available; the cursor partition is set to null once the sources are exhausted. */
- @Override
- public boolean hasTop() {
- return currentPartition != null;
- }
-
- /**
-  * Brings one source in line with the cursor (currentPartition, currentDocID).
-  * <p>
-  * The inline comments below speak of "currentRow" and "currentCQ"; in this class those are the
-  * fields currentPartition and currentDocID.
-  * <p>
-  * For a source that is not negated, the source is seeked until it sits on the cursor or beyond it.
-  * If it ends up beyond, the cursor is moved forward to the source's position. If the source runs
-  * out of data or passes the end row of overallRange, currentPartition is set to null.
-  * <p>
-  * For a negated source, running out of data, passing the end row, or sitting beyond the cursor all
-  * mean the current candidate is acceptable. If the negated source sits exactly on the current
-  * document id, the candidate is rejected by advancing sources[0].
-  *
-  * @param sourceID
-  *            index into sources of the source to align
-  * @return true if the cursor was advanced (including being set to null, or sources[0] being stepped),
-  *         false if the source agrees with the current cursor
-  * @throws IOException
-  *             if seeking or advancing an underlying iterator fails
-  */
- // precondition: currentRow is not null
- private boolean seekOneSource(int sourceID) throws IOException {
- // find the next key in the appropriate column family that is at or beyond the cursor (currentRow, currentCQ)
- // advance the cursor if this source goes beyond it
- // return whether we advanced the cursor
-
- // within this loop progress must be made in one of the following forms:
- // - currentRow or currentCQ must be increased
- // - the given source must advance its iterator
- // this loop will end when any of the following criteria are met
- // - the iterator for the given source is pointing to the key (currentRow, columnFamilies[sourceID], currentCQ)
- // - the given source is out of data and currentRow is set to null
- // - the given source has advanced beyond the endRow and currentRow is set to null
- boolean advancedCursor = false;
-
- if (sources[sourceID].notFlag) {
- while (true) {
- if (sources[sourceID].iter.hasTop() == false) {
- // an empty column that you are negating is a valid condition
- break;
- }
- // check if we're past the end key
- int endCompare = -1;
- // we should compare the row to the end of the range
- if (overallRange.getEndKey() != null) {
- endCompare = overallRange.getEndKey().getRow().compareTo(sources[sourceID].iter.getTopKey().getRow());
- if ((!overallRange.isEndKeyInclusive() && endCompare <= 0) || endCompare < 0) {
- // an empty column that you are negating is a valid condition
- break;
- }
- }
- int partitionCompare = currentPartition.compareTo(getPartition(sources[sourceID].iter.getTopKey()));
- // check if this source is already at or beyond currentRow
- // if not, then seek to at least the current row
-
- if (partitionCompare > 0) {
- // seek to at least the currentRow
- Key seekKey = buildKey(currentPartition, sources[sourceID].term);
- sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true);
- continue;
- }
- // check if this source has gone beyond currentRow
- // if so, this is a valid condition for negation
- if (partitionCompare < 0) {
- break;
- }
- // we have verified that the current source is positioned in currentRow
- // now we must make sure we're in the right columnFamily in the current row
- // Note: Iterators are auto-magically set to the correct columnFamily
- if (sources[sourceID].term != null) {
- int termCompare = sources[sourceID].term.compareTo(getTerm(sources[sourceID].iter.getTopKey()));
- // check if this source is already on the right columnFamily
- // if not, then seek forwards to the right columnFamily
- if (termCompare > 0) {
- Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID);
- sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true);
- continue;
- }
- // check if this source is beyond the right columnFamily
- // if so, then this is a valid condition for negating
- if (termCompare < 0) {
- break;
- }
- }
-
- // we have verified that we are in currentRow and the correct column family
- // make sure we are at or beyond columnQualifier
- Text docID = getDocID(sources[sourceID].iter.getTopKey());
- int docIDCompare = currentDocID.compareTo(docID);
- // If we are past the target, this is a valid result
- if (docIDCompare < 0) {
- break;
- }
- // if this source is not yet at the currentCQ then advance in this source
- if (docIDCompare > 0) {
- // seek forwards
- Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID);
- sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true);
- continue;
- }
- // if we are equal to the target, this is an invalid result.
- // Force the entire process to go to the next row.
- // We are advancing column 0 because we forced that column to not contain a !
- // when we did the init()
- if (docIDCompare == 0) {
- sources[0].iter.next();
- advancedCursor = true;
- break;
- }
- }
- } else {
- while (true) {
- if (sources[sourceID].iter.hasTop() == false) {
- currentPartition = null;
- // setting currentRow to null counts as advancing the cursor
- return true;
- }
- // check if we're past the end key
- int endCompare = -1;
- // we should compare the row to the end of the range
-
- if (overallRange.getEndKey() != null) {
- endCompare = overallRange.getEndKey().getRow().compareTo(sources[sourceID].iter.getTopKey().getRow());
- if ((!overallRange.isEndKeyInclusive() && endCompare <= 0) || endCompare < 0) {
- currentPartition = null;
- // setting currentRow to null counts as advancing the cursor
- return true;
- }
- }
- int partitionCompare = currentPartition.compareTo(getPartition(sources[sourceID].iter.getTopKey()));
- // check if this source is already at or beyond currentRow
- // if not, then seek to at least the current row
- if (partitionCompare > 0) {
- // seek to at least the currentRow
- Key seekKey = buildKey(currentPartition, sources[sourceID].term);
- sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true);
- continue;
- }
- // check if this source has gone beyond currentRow
- // if so, advance currentRow
- if (partitionCompare < 0) {
- currentPartition.set(getPartition(sources[sourceID].iter.getTopKey()));
- currentDocID.set(emptyByteArray);
- advancedCursor = true;
- continue;
- }
- // we have verified that the current source is positioned in currentRow
- // now we must make sure we're in the right columnFamily in the current row
- // Note: Iterators are auto-magically set to the correct columnFamily
-
- if (sources[sourceID].term != null) {
- int termCompare = sources[sourceID].term.compareTo(getTerm(sources[sourceID].iter.getTopKey()));
- // check if this source is already on the right columnFamily
- // if not, then seek forwards to the right columnFamily
- if (termCompare > 0) {
- Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID);
- sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true);
- continue;
- }
- // check if this source is beyond the right columnFamily
- // if so, then seek to the next row
- if (termCompare < 0) {
- // we're out of entries in the current row, so seek to the next one
- // byte[] currentRowBytes = currentRow.getBytes();
- // byte[] nextRow = new byte[currentRowBytes.length + 1];
- // System.arraycopy(currentRowBytes, 0, nextRow, 0, currentRowBytes.length);
- // nextRow[currentRowBytes.length] = (byte)0;
- // // we should reuse text objects here
- // sources[sourceID].seek(new Key(new Text(nextRow),columnFamilies[sourceID]));
- if (endCompare == 0) {
- // we're done
- currentPartition = null;
- // setting currentRow to null counts as advancing the cursor
- return true;
- }
- Key seekKey = buildFollowingPartitionKey(sources[sourceID].iter.getTopKey());
- sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true);
- continue;
- }
- }
- // we have verified that we are in currentRow and the correct column family
- // make sure we are at or beyond columnQualifier
- Text docID = getDocID(sources[sourceID].iter.getTopKey());
- int docIDCompare = currentDocID.compareTo(docID);
- // if this source has advanced beyond the current column qualifier then advance currentCQ and return true
- if (docIDCompare < 0) {
- currentDocID.set(docID);
- advancedCursor = true;
- break;
- }
- // if this source is not yet at the currentCQ then seek in this source
- if (docIDCompare > 0) {
- // seek forwards
- Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID);
- sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true);
- continue;
- }
- // this source is at the current row, in its column family, and at currentCQ
- break;
- }
- }
- return advancedCursor;
- }
-
- /**
-  * Moves to the next intersection, if any.
-  * <p>
-  * Does nothing once the iterator is exhausted (no current partition). Otherwise the first
-  * source is stepped forward and the remaining sources are realigned to it.
-  *
-  * @throws IOException
-  *             if an underlying iterator fails
-  */
- @Override
- public void next() throws IOException {
-     if (currentPartition != null) {
-         // All sources currently agree on the cursor, so stepping the first one is enough
-         // to leave the current match before searching for the next.
-         sources[0].iter.next();
-         advanceToIntersection();
-     }
- }
-
- /**
-  * Realigns the sources until a full pass over them leaves the cursor untouched, then publishes
-  * the cursor as the top key.
-  * <p>
-  * Whenever one source moves the cursor the pass restarts from the first source. If the cursor
-  * partition becomes {@code null} the top key is cleared and the method returns.
-  *
-  * @throws IOException
-  *             if an underlying iterator fails
-  */
- protected void advanceToIntersection() throws IOException {
-     boolean cursorMoved;
-     do {
-         cursorMoved = false;
-         // Stop the pass as soon as one source reports that it moved the cursor.
-         for (int i = 0; i < sourcesCount && !cursorMoved; i++) {
-             if (currentPartition == null) {
-                 topKey = null;
-                 return;
-             }
-             cursorMoved = seekOneSource(i);
-         }
-     } while (cursorMoved);
-     topKey = buildKey(currentPartition, nullText, currentDocID);
- }
-
- /**
-  * Renders the top key of an iterator for diagnostics.
-  *
-  * @param iter
-  *            the iterator to inspect
-  * @return the top key's string form, or the empty string when the iterator has no top
-  */
- public static String stringTopKey(SortedKeyValueIterator<Key, Value> iter) {
-     return iter.hasTop() ? iter.getTopKey().toString() : "";
- }
-
- private static final String columnFamiliesOptionName = "columnFamilies";
- private static final String notFlagOptionName = "notFlag";
-
- /**
- * @param columns
- * @return encoded columns
- * @deprecated since 1.4. To be made protected. Do not interact with flags string directly, just use
- * {@link #setColumnFamilies(IteratorSetting, Text[], boolean[])}.
- */
- public static String encodeColumns(Text[] columns) {
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < columns.length; i++) {
- sb.append(new String(Base64.encodeBase64(TextUtil.getBytes(columns[i]))));
- sb.append('\n');
- }
- return sb.toString();
- }
-
- /**
- * @param flags
- * @return encoded flags
- * @deprecated since 1.4. To be made protected. Do not interact with flags string directly, just use
- * {@link #setColumnFamilies(IteratorSetting, Text[], boolean[])}.
- */
- public static String encodeBooleans(boolean[] flags) {
- byte[] bytes = new byte[flags.length];
- for (int i = 0; i < flags.length; i++) {
- if (flags[i])
- bytes[i] = 1;
- else
- bytes[i] = 0;
- }
- return new String(Base64.encodeBase64(bytes));
- }
-
- /**
-  * Reverses {@link #encodeColumns(Text[])}: splits the option string on newlines and
-  * Base64-decodes every piece.
-  *
-  * @param columns
-  *            the encoded option string; must not be {@code null}
-  * @return the decoded column families, in the order they appear in the string
-  */
- protected static Text[] decodeColumns(String columns) {
-     String[] encoded = columns.split("\n");
-     Text[] decoded = new Text[encoded.length];
-     int next = 0;
-     for (String column : encoded) {
-         decoded[next++] = new Text(Base64.decodeBase64(column.getBytes()));
-     }
-     return decoded;
- }
-
- /**
- * to be made protected
- *
- * @param flags
- * @return decoded flags
- * @deprecated since 1.4. To be made protected. Do not interact with flags string directly, just use
- * {@link #setColumnFamilies(IteratorSetting, Text[], boolean[])}.
- */
- public static boolean[] decodeBooleans(String flags) {
- // return null of there were no flags
- if (flags == null)
- return null;
-
- byte[] bytes = Base64.decodeBase64(flags.getBytes());
- boolean[] bFlags = new boolean[bytes.length];
- for (int i = 0; i < bytes.length; i++) {
- if (bytes[i] == 1)
- bFlags[i] = true;
- else
- bFlags[i] = false;
- }
- return bFlags;
- }
-
- /**
-  * Configures the iterator from its options.
-  * <p>
-  * Reads the encoded column families from the {@code columnFamilies} option and the optional NOT
-  * flags from the {@code notFlag} option. If the first term is negated it is swapped with the
-  * first term that is not, so that the source at index 0 is never negated. The given source
-  * serves the first term; every further term gets its own deep copy of it.
-  *
-  * @param source
-  *            the underlying iterator
-  * @param options
-  *            the iterator options
-  * @param env
-  *            the environment used to deep-copy the source
-  * @throws IllegalArgumentException
-  *             if the column families option is missing, fewer than two column families are given,
-  *             fewer NOT flags than column families are given, or every term is negated
-  */
- @Override
- public void init(SortedKeyValueIterator<Key, Value> source, Map<String, String> options, IteratorEnvironment env) throws IOException {
-     String encodedTerms = options.get(columnFamiliesOptionName);
-     if (encodedTerms == null) {
-         // Fail with a descriptive message instead of a NullPointerException while decoding.
-         throw new IllegalArgumentException("Missing required iterator option: " + columnFamiliesOptionName);
-     }
-     Text[] terms = decodeColumns(encodedTerms);
-     boolean[] notFlag = decodeBooleans(options.get(notFlagOptionName));
-
-     if (terms.length < 2) {
-         throw new IllegalArgumentException("IntersectionIterator requires two or more columns families");
-     }
-
-     // Scan the not flags.
-     // There must be at least one term that isn't negated
-     // And we are going to re-order such that the first term is not a ! term
-     if (notFlag == null) {
-         // A freshly allocated boolean array is all false: no term is negated.
-         notFlag = new boolean[terms.length];
-     } else if (notFlag.length < terms.length) {
-         // Every term needs a flag; otherwise building the sources below would run off the array.
-         throw new IllegalArgumentException("Expected one NOT flag per column family, but got " + notFlag.length
-                 + " flags for " + terms.length + " column families");
-     }
-     if (notFlag[0]) {
-         // Only flags that belong to a term can take part in the swap.
-         for (int i = 1; i < terms.length; i++) {
-             if (!notFlag[i]) {
-                 Text swapFamily = new Text(terms[0]);
-                 terms[0].set(terms[i]);
-                 terms[i].set(swapFamily);
-                 notFlag[0] = false;
-                 notFlag[i] = true;
-                 break;
-             }
-         }
-         if (notFlag[0]) {
-             throw new IllegalArgumentException("IntersectionIterator requires at lest one column family without not");
-         }
-     }
-
-     sources = new TermSource[terms.length];
-     sources[0] = new TermSource(source, terms[0]);
-     for (int i = 1; i < terms.length; i++) {
-         sources[i] = new TermSource(source.deepCopy(env), terms[i], notFlag[i]);
-     }
-     sourcesCount = terms.length;
- }
-
- /**
-  * Positions every source at the start of the range and moves to the first intersection.
-  * <p>
-  * The range is remembered as the overall range and the cursor is reset. When the range has a
-  * start key, each source is seeked to an open-ended range that begins at that key's partition
-  * with the source's own term; otherwise the range is passed through unchanged. Each source is
-  * always seeked with its own column families; {@code seekColumnFamilies} and {@code inclusive}
-  * are not used.
-  *
-  * @throws IOException
-  *             if an underlying iterator fails
-  */
- @Override
- public void seek(Range range, Collection<ByteSequence> seekColumnFamilies, boolean inclusive) throws IOException {
-     overallRange = new Range(range);
-     currentPartition = new Text();
-     currentDocID.set(emptyByteArray);
-
-     final Key startKey = range.getStartKey();
-     for (int i = 0; i < sourcesCount; i++) {
-         final TermSource termSource = sources[i];
-         if (startKey == null) {
-             // No lower bound: seek only to the term for this source as a column family.
-             termSource.iter.seek(range, termSource.seekColfams, true);
-             continue;
-         }
-
-         final Text startQualifier = startKey.getColumnQualifier();
-         final Key sourceKey = (startQualifier != null)
-                 ? buildKey(getPartition(startKey), termSource.term, startQualifier)
-                 : buildKey(getPartition(startKey), termSource.term);
-         // Seek only to the term for this source as a column family.
-         termSource.iter.seek(new Range(sourceKey, true, null, false), termSource.seekColfams, true);
-     }
-     advanceToIntersection();
- }
-
- /**
-  * Appends a term source backed by a deep copy of the given iterator.
-  * <p>
-  * The source array is grown by one slot on every call; existing entries are re-created through
-  * the {@link TermSource} copy constructor.
-  *
-  * @param source
-  *            the iterator to deep-copy for the new term
-  * @param env
-  *            the environment used for the deep copy
-  * @param term
-  *            the term the new source looks for
-  * @param notFlag
-  *            {@code true} if the term is negated
-  */
- public void addSource(SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env, Text term, boolean notFlag) {
-     if (sources != null) {
-         // Make room for one more entry and carry the existing ones over.
-         // TODO: an ArrayList would avoid the manual growth; copying the TermSource objects may also be unnecessary.
-         TermSource[] grown = new TermSource[sources.length + 1];
-         for (int i = 0; i < sources.length; i++) {
-             grown[i] = new TermSource(sources[i]);
-         }
-         sources = grown;
-     } else {
-         sources = new TermSource[1];
-     }
-     sources[sourcesCount] = new TermSource(source.deepCopy(env), term, notFlag);
-     sourcesCount++;
- }
-
- /**
- * Encode the columns to be used when iterating.
- *
- * @param cfg
- * @param columns
- */
- public static void setColumnFamilies(IteratorSetting cfg, Text[] columns) {
- if (columns.length < 2)
- throw new IllegalArgumentException("Must supply at least two terms to intersect");
- cfg.addOption(AndingIterator.columnFamiliesOptionName, AndingIterator.encodeColumns(columns));
- }
-
- /**
- * Encode columns and NOT flags indicating which columns should be negated (docIDs will be excluded if matching negated columns, instead
- * of included).
- *
- * @param cfg
- * @param columns
- * @param notFlags
- */
- public static void setColumnFamilies(IteratorSetting cfg, Text[] columns, boolean[] notFlags) {
- if (columns.length < 2)
- throw new IllegalArgumentException("Must supply at least two terms to intersect");
- if (columns.length != notFlags.length)
- throw new IllegalArgumentException("columns and notFlags arrays must be the same length");
- setColumnFamilies(cfg, columns);
- cfg.addOption(AndingIterator.notFlagOptionName, AndingIterator.encodeBooleans(notFlags));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java
deleted file mode 100644
index a69b78a..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/iterators/BooleanTreeIterator.java
+++ /dev/null
@@ -1,322 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext.iterators;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.allChildrenAreNot;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.findFirstNonNotChild;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.getNodeIterator;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.isNotFlag;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.pushChild;
-import static mvm.rya.indexing.accumulo.freetext.query.ASTNodeUtils.swapChildren;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-import java.util.NoSuchElementException;
-
-import mvm.rya.indexing.accumulo.freetext.ColumnPrefixes;
-import mvm.rya.indexing.accumulo.freetext.query.ASTExpression;
-import mvm.rya.indexing.accumulo.freetext.query.ASTTerm;
-import mvm.rya.indexing.accumulo.freetext.query.ParseException;
-import mvm.rya.indexing.accumulo.freetext.query.QueryParser;
-import mvm.rya.indexing.accumulo.freetext.query.QueryParserTreeConstants;
-import mvm.rya.indexing.accumulo.freetext.query.SimpleNode;
-import mvm.rya.indexing.accumulo.freetext.query.TokenMgrError;
-
-import org.apache.accumulo.core.client.IteratorSetting;
-import org.apache.accumulo.core.data.ByteSequence;
-import org.apache.accumulo.core.data.Key;
-import org.apache.accumulo.core.data.Range;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.iterators.IteratorEnvironment;
-import org.apache.accumulo.core.iterators.OptionDescriber;
-import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
-import org.apache.accumulo.core.iterators.system.MultiIterator;
-import org.apache.commons.lang.Validate;
-import org.apache.hadoop.io.Text;
-import org.apache.log4j.Logger;
-
-public class BooleanTreeIterator implements SortedKeyValueIterator<Key, Value>, OptionDescriber {
- private static Logger logger = Logger.getLogger(BooleanTreeIterator.class);
-
- private static String queryOptionName = "query";
-
- private SortedKeyValueIterator<Key, Value> iter;
- private SortedKeyValueIterator<Key, Value> docSource;
-
- @Override
- public void init(SortedKeyValueIterator<Key, Value> source, Map<String, String> options, IteratorEnvironment env) throws IOException {
-
- // pull out the query
- String query = options.get(queryOptionName);
-
- // create the parse tree
- SimpleNode root;
- try {
- root = QueryParser.parse(query);
- } catch (ParseException e) {
- // log and wrap in IOException
- logger.error("ParseException encountered while parsing: " + query, e);
- throw new IOException(e);
- } catch (TokenMgrError e) {
- // log and wrap in IOException
- logger.error("TokenMgrError encountered while parsing: " + query, e);
- throw new IOException(e);
- }
-
- docSource = source.deepCopy(env);
- iter = createIterator((SimpleNode) root.jjtGetChild(0), source, env);
- }
-
- /**
-  * Builds the iterator tree for a parsed query.
-  * <p>
-  * A query that consists of a single term is first wrapped in an "and" expression, because the recursive builder only accepts
-  * expression nodes. The tree is then pre-processed (see {@link #preProcessTree(SimpleNode)}) and converted into iterators.
-  *
-  * @param root
-  *            the root of the query tree: either a term or an expression
-  * @param source
-  *            the source iterator the term iterators read from
-  * @param env
-  *            the iterator environment
-  * @return the iterator that evaluates the query
-  */
- private SortedKeyValueIterator<Key, Value> createIterator(SimpleNode root, SortedKeyValueIterator<Key, Value> source,
-         IteratorEnvironment env) {
-     SimpleNode treeRoot = root;
-
-     if (root instanceof ASTTerm) {
-         // a lone term becomes the only child of a fresh, non-negated "and" expression
-         ASTExpression wrapper = new ASTExpression(QueryParserTreeConstants.JJTEXPRESSION);
-         wrapper.setNotFlag(false);
-         wrapper.setType(ASTExpression.AND);
-
-         pushChild(wrapper, root);
-         root.jjtSetParent(wrapper);
-
-         treeRoot = wrapper;
-     }
-
-     // compensate for topologies the iterators cannot handle directly
-     preProcessTree(treeRoot);
-
-     return createIteratorRecursive(treeRoot, source, env);
- }
-
- private SortedKeyValueIterator<Key, Value> createIteratorRecursive(SimpleNode node, SortedKeyValueIterator<Key, Value> source,
- IteratorEnvironment env) {
-
- Validate.isTrue(node instanceof ASTExpression, "node must be of type ASTExpression. Node is instance of "
- + node.getClass().getName());
-
- ASTExpression expression = (ASTExpression) node;
-
- if (expression.getType().equals(ASTExpression.AND)) {
- return getAndIterator(node, source, env);
- }
-
- if (expression.getType().equals(ASTExpression.OR)) {
- return getOrIterator(node, source, env);
- }
-
- throw new IllegalArgumentException("Expression is of unknown type: " + expression.getType());
-
- }
-
- private MultiIterator getOrIterator(SimpleNode node, SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env) {
- List<SortedKeyValueIterator<Key, Value>> iters = new ArrayList<SortedKeyValueIterator<Key, Value>>();
-
- for (SimpleNode n : getNodeIterator(node)) {
- if (n instanceof ASTExpression) {
- iters.add(createIteratorRecursive(n, source, env));
- } else if (n instanceof ASTTerm) {
- iters.add(getSimpleAndingIterator((ASTTerm) n, source, env));
- } else {
- throw new IllegalArgumentException("Node is of unknown type: " + n.getClass().getName());
- }
- }
-
- return new MultiIterator(iters, new Range());
- }
-
- private AndingIterator getAndIterator(SimpleNode node, SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env) {
-
- AndingIterator anding = new AndingIterator();
-
- for (SimpleNode n : getNodeIterator(node)) {
- boolean isNotFlag = isNotFlag(n);
- if (n instanceof ASTExpression) {
- anding.addSource(createIteratorRecursive(n, source, env), env, null, isNotFlag);
- } else if (n instanceof ASTTerm) {
- ASTTerm term = ((ASTTerm) n);
- anding.addSource(source, env, getTermColFam(term), isNotFlag);
- } else {
- throw new IllegalArgumentException("Node is of unknown type: " + n.getClass().getName());
- }
- }
-
- return anding;
- }
-
- private static Text getTermColFam(ASTTerm termnode) {
- String term = termnode.getTerm();
- if (term == null) {
- // if the term is null, then I want all of the documents
- return ColumnPrefixes.DOCS_CF_PREFIX;
- }
- if (term.contains("\0")) {
- // if the term is contain a null char, then it's already formated for a CF
- return new Text(term);
- }
-
- // otherwise, point to the term CF
- return ColumnPrefixes.getTermColFam(term.toLowerCase());
- }
-
- private AndingIterator getSimpleAndingIterator(ASTTerm node, SortedKeyValueIterator<Key, Value> source, IteratorEnvironment env) {
- Validate.isTrue(!node.isNotFlag(), "Simple Anding node must not have \"not\" flag set");
-
- AndingIterator anding = new AndingIterator();
- anding.addSource(source, env, getTermColFam(node), false);
- return anding;
- }
-
- /**
-  * Rewrites the query tree in place so that the iterators can evaluate it. Three topologies are handled, all of them due to
-  * implementation specific limitations of the iterators:
-  * <ul>
-  * <li>lonely nots (an "and" whose children are all negated): {@code (!a & !b) -> [all] & !a & !b}</li>
-  * <li>"or" nots (a negated child of an "or"): {@code (!a | b) -> ( ([all] & !a) | b)}</li>
-  * <li>an "and" whose first child is negated is reordered: {@code (!a & b) -> ( b & !a )}</li>
-  * </ul>
-  * Children are processed before their parent.
-  *
-  * @param s
-  *            the root of the (sub) tree to rewrite
-  **/
- public static void preProcessTree(SimpleNode s) {
-     // bottom-up: rewrite the sub-trees before looking at this node
-     for (SimpleNode descendant : getNodeIterator(s)) {
-         preProcessTree(descendant);
-     }
-
-     // only expressions need rewriting
-     if (!(s instanceof ASTExpression)) {
-         return;
-     }
-     final ASTExpression expression = (ASTExpression) s;
-
-     if (expression.getType().equals(ASTExpression.AND)) {
-         if (allChildrenAreNot(expression)) {
-             // lonely nots: (!a & !b) -> [all] & !a & !b
-             pushChild(expression, createAllDocTermNode());
-         } else if (isNotFlag(expression.jjtGetChild(0))) {
-             // reorder "and" nots: (!a & b) -> ( b & !a )
-             swapChildren(expression, 0, findFirstNonNotChild(expression));
-         }
-     }
-
-     if (expression.getType().equals(ASTExpression.OR)) {
-         for (int i = 0; i < expression.jjtGetNumChildren(); i++) {
-             final SimpleNode child = (SimpleNode) expression.jjtGetChild(i);
-             if (!isNotFlag(child)) {
-                 continue;
-             }
-
-             // "or" nots: (!a | b) -> ( ([all] & !a) | b)
-             // the negated child moves into a new "and" expression, next to an "all docs" term
-             ASTExpression wrapper = new ASTExpression(QueryParserTreeConstants.JJTEXPRESSION);
-             wrapper.setNotFlag(false);
-             wrapper.setType(ASTExpression.AND);
-             pushChild(wrapper, child);
-             pushChild(wrapper, createAllDocTermNode());
-
-             // the new expression takes the negated child's slot in the "or"
-             wrapper.jjtSetParent(expression);
-             expression.jjtAddChild(wrapper, i);
-         }
-     }
- }
-
- /**
-  * Creates the term node that stands for "all documents": a non-negated term whose term text is {@code null}.
-  *
-  * @return a new "all docs" term node
-  */
- public static ASTTerm createAllDocTermNode() {
-     final ASTTerm allDocs = new ASTTerm(QueryParserTreeConstants.JJTTERM);
-     allDocs.setNotFlag(false);
-     allDocs.setType(ASTTerm.TERM);
-
-     // a null term is the marker for "return all docs"
-     allDocs.setTerm(null);
-
-     return allDocs;
- }
-
- /**
- * Reports whether the query iterator built in init() currently has a top entry.
- *
- * @return the result of {@code iter.hasTop()}
- */
- @Override
- public boolean hasTop() {
- return iter.hasTop();
- }
-
- /**
-  * Advances the query iterator and, if it still has a top entry, positions the document source on that entry.
-  *
-  * @throws IOException
-  *             if advancing the query iterator or seeking the document source fails
-  */
- @Override
- public void next() throws IOException {
-     iter.next();
-
-     // nothing to look up once the query iterator is exhausted
-     if (!iter.hasTop()) {
-         return;
-     }
-
-     seekDocSource(iter.getTopKey());
- }
-
- /**
-  * Seeks the query iterator and, if it has a top entry afterwards, positions the document source on that entry.
-  *
-  * @param range
-  *            the range to seek to
-  * @param columnFamilies
-  *            the column families passed through to the query iterator
-  * @param inclusive
-  *            whether {@code columnFamilies} is an inclusive or exclusive filter
-  * @throws IOException
-  *             if seeking the query iterator or the document source fails
-  */
- @Override
- public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
-     iter.seek(range, columnFamilies, inclusive);
-
-     // nothing to look up when the query has no match in the range
-     if (!iter.hasTop()) {
-         return;
-     }
-
-     seekDocSource(iter.getTopKey());
- }
-
- private void seekDocSource(Key key) throws IOException {
- Key docKey = new Key(key.getRow(), ColumnPrefixes.DOCS_CF_PREFIX, key.getColumnQualifier());
- docSource.seek(new Range(docKey, true, null, false), Collections.<ByteSequence> emptyList(), false);
- }
-
- /**
- * Returns the top key of the query iterator built in init(). The key is passed through unchanged.
- *
- * @return the result of {@code iter.getTopKey()}
- */
- @Override
- public Key getTopKey() {
- // from intersecting iterator:
- // RowID: shardID
- // CF: (empty)
- // CQ: docID
- return iter.getTopKey();
- }
-
- /**
-  * Returns the value the document source is currently positioned on. The document source is positioned by {@code seekDocSource}
-  * whenever the query iterator moves to a new top entry.
-  *
-  * @return the top value of the document source
-  * @throws NoSuchElementException
-  *             if the query iterator has no top entry
-  */
- @Override
- public Value getTopValue() {
-     if (iter.hasTop()) {
-         return docSource.getTopValue();
-     }
-
-     throw new NoSuchElementException();
- }
-
- /**
- * Not supported by this iterator.
- *
- * @param env
- *            unused
- * @return never returns normally
- * @throws UnsupportedOperationException
- *             always
- */
- @Override
- public SortedKeyValueIterator<Key, Value> deepCopy(IteratorEnvironment env) {
- throw new UnsupportedOperationException();
- }
-
- /**
- * Stores the free text query on the iterator setting, under the option name that init() reads the query from.
- *
- * @param cfg
- *            the iterator setting that receives the query option
- * @param query
- *            the free text query
- */
- public static void setQuery(IteratorSetting cfg, String query) {
- cfg.addOption(BooleanTreeIterator.queryOptionName, query);
- }
-
- /**
-  * Describes this iterator and its single named option, the free text query.
-  *
-  * @return the option description; it has no unnamed options
-  */
- @Override
- public IteratorOptions describeOptions() {
-     final Map<String, String> namedOptions = Collections.singletonMap(queryOptionName, "the free text query");
-
-     return new IteratorOptions("FreeTextBooleanTree", "Perform a FreeText Query on properly formated table", namedOptions, null);
- }
-
- /**
-  * Checks that the options contain a non-empty query.
-  *
-  * @param options
-  *            the iterator options to check
-  * @return {@code true} if the query option is present and not empty
-  * @throws IllegalArgumentException
-  *             if the query option is missing or empty
-  */
- @Override
- public boolean validateOptions(Map<String, String> options) {
-     final String query = options.get(queryOptionName);
-     final boolean queryMissing = query == null || query.isEmpty();
-
-     if (queryMissing) {
-         throw new IllegalArgumentException(queryOptionName + " must not be empty");
-     }
-
-     return true;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java
deleted file mode 100644
index 95783e5..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTExpression.java
+++ /dev/null
@@ -1,63 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext.query;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-/**
- * This is a slightly modified version of the ASTExpression file created by JavaCC. This version adds more state to the standard ASTTerm
- * file including a "type", and "notFlag".
- */
-public class ASTExpression extends SimpleNode {
- public static final String AND = "AND";
- public static final String OR = "OR";
-
- private String type = "";
- private boolean notFlag = false;
-
- public ASTExpression(int id) {
- super(id);
- }
-
- public ASTExpression(QueryParser p, int id) {
- super(p, id);
- }
-
- public void setType(String type) {
- this.type = type;
- }
-
- public String getType() {
- return type;
- }
-
- public boolean isNotFlag() {
- return notFlag;
- }
-
- public void setNotFlag(boolean notFlag) {
- this.notFlag = notFlag;
- }
-
- @Override
- public String toString() {
- return super.toString() + " [type: " + type + ", notFlag: " + notFlag + "]";
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/5a03ef61/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java
----------------------------------------------------------------------
diff --git a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java b/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java
deleted file mode 100644
index 27edaac..0000000
--- a/extras/indexing/src/main/java/mvm/rya/indexing/accumulo/freetext/query/ASTNodeUtils.java
+++ /dev/null
@@ -1,210 +0,0 @@
-package mvm.rya.indexing.accumulo.freetext.query;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.NoSuchElementException;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.lang.Validate;
-
-public class ASTNodeUtils {
-
- /**
-  * Serialize a node (and its children) to a parsable string.
-  * <p>
-  * A term is written as its term text, preceded by a space and, if negated, by "!". An expression is written as its serialized
-  * children joined by the expression type and enclosed in parentheses, preceded by "!" if negated. Any other node is written as its
-  * serialized children joined by a single space.
-  *
-  * @param s
-  *            the node to serialize
-  * @return the serialized form of the node
-  */
- public static String serializeExpression(Node s) {
-     if (s instanceof ASTTerm) {
-         final ASTTerm term = (ASTTerm) s;
-         final String negation = term.isNotFlag() ? "!" : "";
-         return negation + " " + term.getTerm();
-     }
-
-     // defaults for nodes that are neither a term nor an expression
-     String opening = "";
-     String closing = "";
-     String separator = " ";
-
-     if (s instanceof ASTExpression) {
-         final ASTExpression expression = (ASTExpression) s;
-         opening = expression.isNotFlag() ? "!(" : "(";
-         closing = ")";
-         separator = " " + expression.getType() + " ";
-     }
-
-     final int childCount = s.jjtGetNumChildren();
-     final List<String> serializedChildren = new ArrayList<String>(childCount);
-     for (int index = 0; index < childCount; index++) {
-         serializedChildren.add(serializeExpression(s.jjtGetChild(index)));
-     }
-
-     return opening + StringUtils.join(serializedChildren, separator) + closing;
- }
-
- /**
-  * Count the number of terms in this query tree.
-  * <p>
-  * {@link ASTTerm} nodes are themselves {@link SimpleNode}s, so the term check has to come first. Checking for {@link SimpleNode} first
-  * sends every term down the child-counting branch, where it contributes nothing, and the method reports zero terms for any tree.
-  *
-  * @param node
-  *            the root of the (sub) tree to inspect
-  * @return the number of {@link ASTTerm} nodes in the tree
-  * @throws IllegalArgumentException
-  *             if a node is neither an {@link ASTTerm} nor a {@link SimpleNode}
-  */
- public static int termCount(Node node) {
-     if (node instanceof ASTTerm) {
-         // a term counts once; its children (if any) are not inspected
-         return 1;
-     }
-
-     if (node instanceof SimpleNode) {
-         // any other node contributes the terms found below it
-         int count = 0;
-         for (SimpleNode n : getNodeIterator((SimpleNode) node)) {
-             count += termCount(n);
-         }
-         return count;
-     }
-
-     throw new IllegalArgumentException("Node is of unknown type: " + node.getClass().getName());
- }
-
- /**
- * Add the child as the parent's first child.
- * <p>
- * Existing children are not shifted: the former first child (if there is one) is moved to the end of the child list, and all other
- * children keep their index. The child's parent reference is set to the parent.
- *
- * @param parent
- *            the node that receives the child
- * @param child
- *            the node to place at index 0 of the parent
- */
- public static void pushChild(SimpleNode parent, SimpleNode child) {
- // note: this implementation is very coupled with the SimpleNode jjt implementation
- int parentSize = parent.jjtGetNumChildren();
-
- // expand the parent node by writing a placeholder one slot past the current last child
- // NOTE(review): relies on jjtAddChild(node, index) growing the child list and accepting null -- confirm against SimpleNode
- parent.jjtAddChild(null, parentSize);
-
- // get the current head child (null when the parent had no children, because slot 0 is then the placeholder)
- Node currentHeadChild = parent.jjtGetChild(0);
-
- // set the parameter as the parent's first child
- parent.jjtAddChild(child, 0);
-
- // add the former head child to the end of the list, i.e. into the slot created above
- if (currentHeadChild != null) {
- parent.jjtAddChild(currentHeadChild, parentSize);
- }
-
- // tie the child to the parent
- child.jjtSetParent(parent);
-
- }
-
- /**
-  * Get the index of the child, -1 if child not found. Children are compared with {@code child.equals(...)}.
-  *
-  * @param parent
-  *            the node whose children are searched
-  * @param child
-  *            the node to look for
-  * @return the index of the first child equal to {@code child}, or -1 if there is none
-  */
- public static int getChildIndex(SimpleNode parent, SimpleNode child) {
-     final int childCount = parent.jjtGetNumChildren();
-
-     int index = 0;
-     while (index < childCount) {
-         if (child.equals(parent.jjtGetChild(index))) {
-             return index;
-         }
-         index++;
-     }
-
-     return -1;
- }
-
- /**
- * return true is all of the node's children have the not flag enabled.
- *
- * @param node
- * @return
- */
- public static boolean allChildrenAreNot(ASTExpression node) {
- for (SimpleNode child : getNodeIterator(node)) {
- if (!isNotFlag(child)) {
- return false;
- }
- }
- return true;
- }
-
- /**
-  * Return the node's not flag value. The node must be of type {@link ASTTerm} or {@link ASTExpression}.
-  *
-  * @param node
-  *            the node to inspect
-  * @return the value of the node's not flag
-  * @throws IllegalArgumentException
-  *             if the node is neither an {@link ASTExpression} nor an {@link ASTTerm}
-  */
- public static boolean isNotFlag(Node node) {
-     if (node instanceof ASTExpression) {
-         final ASTExpression expression = (ASTExpression) node;
-         return expression.isNotFlag();
-     }
-
-     if (node instanceof ASTTerm) {
-         final ASTTerm term = (ASTTerm) node;
-         return term.isNotFlag();
-     }
-
-     throw new IllegalArgumentException("Node is of unknown type: " + node.getClass().getName());
- }
-
- /**
-  * Wrap the children of a node in an {@link Iterable}, so that they can be visited with a for-each loop.
-  * <p>
-  * The iterators read the child count and the children from the node on every call; nothing is copied. Every child is cast to
-  * {@link SimpleNode}. Removing children through the iterator is not supported.
-  *
-  * @param n
-  *            the node whose children are iterated
-  * @return an iterable over the children of {@code n}, in index order
-  */
- public static Iterable<SimpleNode> getNodeIterator(final SimpleNode n) {
-     return new Iterable<SimpleNode>() {
-
-         @Override
-         public Iterator<SimpleNode> iterator() {
-             return new Iterator<SimpleNode>() {
-                 // index of the child returned by the next call to next()
-                 private int pointer = 0;
-
-                 @Override
-                 public boolean hasNext() {
-                     return pointer < n.jjtGetNumChildren();
-                 }
-
-                 @Override
-                 public SimpleNode next() {
-                     // honor the Iterator contract instead of handing an out-of-range index to jjtGetChild
-                     if (!hasNext()) {
-                         throw new NoSuchElementException();
-                     }
-
-                     Node rtn = n.jjtGetChild(pointer);
-                     pointer++;
-                     return (SimpleNode) rtn;
-                 }
-
-                 @Override
-                 public void remove() {
-                     throw new UnsupportedOperationException();
-                 }
-             };
-         }
-     };
- }
-
- /**
-  * Exchange two children of an expression.
-  *
-  * @param parent
-  *            the expression whose children are exchanged
-  * @param childOneIndex
-  *            the index of the first child; must be a valid child index
-  * @param childTwoIndex
-  *            the index of the second child; must be a valid child index
-  * @throws IllegalArgumentException
-  *             if one of the indexes is negative or not smaller than the number of children
-  */
- public static void swapChildren(ASTExpression parent, int childOneIndex, int childTwoIndex) {
-     final int childCount = parent.jjtGetNumChildren();
-     Validate.isTrue(childOneIndex >= 0 && childOneIndex < childCount);
-     Validate.isTrue(childTwoIndex >= 0 && childTwoIndex < childCount);
-
-     // read both children before either slot is overwritten
-     final Node first = parent.jjtGetChild(childOneIndex);
-     final Node second = parent.jjtGetChild(childTwoIndex);
-
-     parent.jjtAddChild(first, childTwoIndex);
-     parent.jjtAddChild(second, childOneIndex);
- }
-
- public static int findFirstNonNotChild(ASTExpression expression) {
- for (int i = 0; i < expression.jjtGetNumChildren(); i++) {
- if (!isNotFlag(expression.jjtGetChild(i))) {
- return i;
- }
- }
- return -1;
- }
-
-}