You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jb...@apache.org on 2016/04/26 22:30:33 UTC
[2/2] lucene-solr:master: SOLR-9027: Add GraphTermsQuery to limit
traversal on high frequency nodes
SOLR-9027: Add GraphTermsQuery to limit traversal on high frequency nodes
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/d66f5515
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/d66f5515
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/d66f5515
Branch: refs/heads/master
Commit: d66f5515e648bdf52f3ea36ae76af72742a95336
Parents: 9c69c4c
Author: jbernste <jb...@apache.org>
Authored: Tue Apr 26 10:15:03 2016 -0400
Committer: jbernste <jb...@apache.org>
Committed: Tue Apr 26 16:33:26 2016 -0400
----------------------------------------------------------------------
.../solr/search/GraphTermsQParserPlugin.java | 292 +++++++++++++++++++
.../org/apache/solr/search/QParserPlugin.java | 1 +
.../apache/solr/search/QueryEqualityTest.java | 9 +
.../search/TestGraphTermsQParserPlugin.java | 135 +++++++++
4 files changed, 437 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d66f5515/solr/core/src/java/org/apache/solr/search/GraphTermsQParserPlugin.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/GraphTermsQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/GraphTermsQParserPlugin.java
new file mode 100644
index 0000000..5a2fe11
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/GraphTermsQParserPlugin.java
@@ -0,0 +1,292 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.search;
+
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BulkScorer;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.FieldType;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * The GraphTermsQuery builds a disjunction query from a list of terms. Terms whose document frequency is greater than or equal to maxDocFreq are dropped first.
+ * This allows graph traversals to skip traversing high frequency nodes which is often desirable from a performance standpoint.
+ *
+ * Syntax: {!graphTerms f=field maxDocFreq=10000}term1,term2,term3
+ **/
+
+public class GraphTermsQParserPlugin extends QParserPlugin {
+ public static final String NAME = "graphTerms";
+
+ @Override
+ public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
+ return new QParser(qstr, localParams, params, req) {
+ @Override
+ public Query parse() throws SyntaxError {
+ String fname = localParams.get(QueryParsing.F);
+ FieldType ft = req.getSchema().getFieldTypeNoEx(fname);
+ int maxDocFreq = localParams.getInt("maxDocFreq", Integer.MAX_VALUE);
+ String qstr = localParams.get(QueryParsing.V);//never null
+
+ if (qstr.length() == 0) {
+ return new MatchNoDocsQuery();
+ }
+
+ final String[] splitVals = qstr.split(",");
+
+ BytesRef[] bytesRefs = new BytesRef[splitVals.length];
+ BytesRefBuilder term = new BytesRefBuilder();
+ for (int i = 0; i < splitVals.length; i++) {
+ String stringVal = splitVals[i].trim();
+ if (ft != null) {
+ ft.readableToIndexed(stringVal, term);
+ } else {
+ term.copyChars(stringVal);
+ }
+ bytesRefs[i] = term.toBytesRef();
+ }
+
+ return new ConstantScoreQuery(new GraphTermsQuery(fname, bytesRefs, maxDocFreq));
+ }
+ };
+ }
+
+ private class GraphTermsQuery extends Query {
+
+ private Term[] queryTerms;
+ private List<TermContext> finalContexts;
+ private List<Term> finalTerms;
+ private String field;
+ private int maxDocFreq;
+ private Object id;
+
+ public GraphTermsQuery(String field, BytesRef[] terms, int maxDocFreq) {
+ this.maxDocFreq = maxDocFreq;
+ this.field = field;
+ this.queryTerms = new Term[terms.length];
+ this.id = new Object();
+ for(int i=0; i<terms.length; i++) {
+ this.queryTerms[i] = new Term(field, terms[i]);
+ }
+ }
+
+ //Just for cloning
+ private GraphTermsQuery(String field, Term[] terms, int maxDocFreq, Object id) {
+ this.field = field;
+ this.queryTerms = terms;
+ this.maxDocFreq = maxDocFreq;
+ this.id = id;
+ }
+
+ @Override
+ public Query rewrite(IndexReader reader) throws IOException {
+ if(this.finalContexts == null) {
+ //Not rewritten yet: resolve every query term against the reader and keep only terms whose docFreq is strictly below maxDocFreq.
+ //Rewriting the query does not affect the cache key as this query is not designed to be cached.
+ this.finalContexts = new ArrayList<>();
+ this.finalTerms = new ArrayList<>();
+ List<LeafReaderContext> contexts = reader.leaves();
+ TermContext[] termContexts = new TermContext[this.queryTerms.length];
+ collectTermContext(reader, contexts, termContexts, this.queryTerms);
+ for(int i=0; i<termContexts.length; i++) {
+ TermContext termContext = termContexts[i]; //null when the term was not found in any leaf
+ if(termContext != null && termContext.docFreq() < this.maxDocFreq) {
+ this.finalContexts.add(termContext); //finalContexts and finalTerms are kept index-aligned
+ this.finalTerms.add(queryTerms[i]);
+ }
+ }
+ }
+
+ return this; //NOTE(review): the resolved contexts stay cached on this instance, even if a later call passes a different reader
+ }
+
+ public int hashCode() {
+ return 31 * super.hashCode() + id.hashCode();
+ }
+
+ public boolean equals(Object o) {
+ if (super.equals(o) == false) {
+ return false;
+ }
+
+ GraphTermsQuery q = (GraphTermsQuery)o;
+ return id == q.id;
+ }
+
+ public GraphTermsQuery clone() {
+ GraphTermsQuery clone = new GraphTermsQuery(this.field, this.queryTerms, this.maxDocFreq, this.id);
+ return clone;
+ }
+
+ @Override
+ public String toString(String defaultField) {
+ StringBuilder builder = new StringBuilder();
+ boolean first = true;
+ for (Term term : this.queryTerms) {
+ if (!first) {
+ builder.append(',');
+ }
+ first = false;
+ builder.append(term.toString());
+ }
+
+ return builder.toString();
+ }
+
+ private class WeightOrDocIdSet {
+ final Weight weight;
+ final DocIdSet set;
+
+ WeightOrDocIdSet(DocIdSet bitset) {
+ this.set = bitset;
+ this.weight = null;
+ }
+ }
+
+ @Override
+ public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
+ return new ConstantScoreWeight(this) {
+
+ @Override
+ public void extractTerms(Set<Term> terms) {
+ // no-op
+ // This query is for abuse cases when the number of terms is too high to
+ // run efficiently as a BooleanQuery. So likewise we hide its terms in
+ // order to protect highlighters
+ }
+
+ //Builds the doc id set for one segment: the union of the postings of every retained term that has a term state in this leaf.
+ private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
+ final LeafReader reader = context.reader();
+ final Terms terms = reader.fields().terms(field);
+ //The field may not exist in this segment; terms is then null and nothing can match (scorer(DocIdSet) maps a null set to a null scorer).
+ if (terms == null) return new WeightOrDocIdSet(null);
+ TermsEnum termsEnum = terms.iterator();
+ PostingsEnum docs = null;
+ DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
+ for (int i=0; i<finalContexts.size(); i++) {
+ TermState termState = finalContexts.get(i).get(context.ord); //null when the term is absent from this leaf
+ if(termState != null) {
+ termsEnum.seekExact(finalTerms.get(i).bytes(), termState);
+ docs = termsEnum.postings(docs, PostingsEnum.NONE);
+ builder.add(docs);
+ }
+ }
+ return new WeightOrDocIdSet(builder.build());
+ }
+
+ private Scorer scorer(DocIdSet set) throws IOException {
+ if (set == null) {
+ return null;
+ }
+ final DocIdSetIterator disi = set.iterator();
+ if (disi == null) {
+ return null;
+ }
+ return new ConstantScoreScorer(this, score(), disi);
+ }
+
+ @Override
+ public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
+ final WeightOrDocIdSet weightOrBitSet = rewrite(context);
+ if (weightOrBitSet.weight != null) {
+ return weightOrBitSet.weight.bulkScorer(context);
+ } else {
+ final Scorer scorer = scorer(weightOrBitSet.set);
+ if (scorer == null) {
+ return null;
+ }
+ return new DefaultBulkScorer(scorer);
+ }
+ }
+
+ @Override
+ public Scorer scorer(LeafReaderContext context) throws IOException {
+ final WeightOrDocIdSet weightOrBitSet = rewrite(context);
+ if (weightOrBitSet.weight != null) {
+ return weightOrBitSet.weight.scorer(context);
+ } else {
+ return scorer(weightOrBitSet.set);
+ }
+ }
+ };
+ }
+
+ private void collectTermContext(IndexReader reader,
+ List<LeafReaderContext> leaves,
+ TermContext[] contextArray,
+ Term[] queryTerms) throws IOException {
+ TermsEnum termsEnum = null;
+ for (LeafReaderContext context : leaves) {
+ final Fields fields = context.reader().fields();
+ for (int i = 0; i < queryTerms.length; i++) {
+ Term term = queryTerms[i];
+ TermContext termContext = contextArray[i];
+ final Terms terms = fields.terms(term.field());
+ if (terms == null) {
+ // field does not exist
+ continue;
+ }
+ termsEnum = terms.iterator();
+ assert termsEnum != null;
+
+ if (termsEnum == TermsEnum.EMPTY) continue;
+ if (termsEnum.seekExact(term.bytes())) {
+ if (termContext == null) {
+ contextArray[i] = new TermContext(reader.getContext(),
+ termsEnum.termState(), context.ord, termsEnum.docFreq(),
+ termsEnum.totalTermFreq());
+ } else {
+ termContext.register(termsEnum.termState(), context.ord,
+ termsEnum.docFreq(), termsEnum.totalTermFreq());
+ }
+
+ }
+ }
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d66f5515/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
index cf556ff..7a6247f 100644
--- a/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
+++ b/solr/core/src/java/org/apache/solr/search/QParserPlugin.java
@@ -76,6 +76,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
map.put(HashQParserPlugin.NAME, HashQParserPlugin.class);
map.put(GraphQParserPlugin.NAME, GraphQParserPlugin.class);
map.put(XmlQParserPlugin.NAME, XmlQParserPlugin.class);
+ map.put(GraphTermsQParserPlugin.NAME, GraphTermsQParserPlugin.class);
standardPlugins = Collections.unmodifiableMap(map);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d66f5515/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
index a9b0209..2a89473 100644
--- a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
+++ b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java
@@ -165,6 +165,15 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
}
}
+ public void testGraphTermsQuery() throws Exception {
+ SolrQueryRequest req = req("q", "*:*");
+ try {
+ assertQueryEquals("graphTerms", req, "{!graphTerms f=field1 maxDocFreq=1000}term1,term2");
+ } finally {
+ req.close();
+ }
+ }
+
public void testQuerySwitch() throws Exception {
SolrQueryRequest req = req("myXXX", "XXX",
"myField", "foo_s",
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d66f5515/solr/core/src/test/org/apache/solr/search/TestGraphTermsQParserPlugin.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/TestGraphTermsQParserPlugin.java b/solr/core/src/test/org/apache/solr/search/TestGraphTermsQParserPlugin.java
new file mode 100644
index 0000000..cfff980
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/search/TestGraphTermsQParserPlugin.java
@@ -0,0 +1,135 @@
+package org.apache.solr.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collections;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.search.CollapsingQParserPlugin.GroupHeadSelector;
+import org.apache.solr.search.CollapsingQParserPlugin.GroupHeadSelectorType;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+//We want codecs that support DocValues, and ones supporting blank/empty values.
+@SuppressCodecs({"Appending","Lucene3x","Lucene40","Lucene41","Lucene42"})
+public class TestGraphTermsQParserPlugin extends SolrTestCaseJ4 {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-collapseqparser.xml", "schema11.xml");
+ }
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ // if you override setUp or tearDown, you must call
+ // the superclass's version
+ super.setUp();
+ clearIndex();
+ assertU(commit());
+ }
+
+ @Test
+ public void testQueries() throws Exception {
+
+ String group = "group_s";
+
+ String[] doc = {"id","1", "term_s", "YYYY", group, "1", "test_ti", "5", "test_tl", "10", "test_tf", "2000"};
+ assertU(adoc(doc));
+ String[] doc1 = {"id","2", "term_s","YYYY", group, "1", "test_ti", "5", "test_tl", "100", "test_tf", "200"};
+ assertU(adoc(doc1));
+
+ String[] doc2 = {"id","3", "term_s", "YYYY", "test_ti", "5000", "test_tl", "100", "test_tf", "200"};
+ assertU(adoc(doc2));
+ assertU(commit());
+ String[] doc3 = {"id","4", "term_s", "YYYY", "test_ti", "500", "test_tl", "1000", "test_tf", "2000"};
+ assertU(adoc(doc3));
+
+ String[] doc4 = {"id","5", "term_s", "YYYY", group, "2", "test_ti", "5", "test_tl", "10", "test_tf", "2000"};
+ assertU(adoc(doc4));
+ assertU(commit());
+ String[] doc5 = {"id","6", "term_s","YYYY", group, "2", "test_ti", "10", "test_tl", "100", "test_tf", "200"};
+ assertU(adoc(doc5));
+ assertU(commit());
+
+ String[] doc6 = {"id","7", "term_s", "YYYY", group, "1", "test_ti", "10", "test_tl", "50", "test_tf", "300"};
+ assertU(adoc(doc6));
+ assertU(commit());
+
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.add("q", "{!graphTerms f=group_s maxDocFreq=10}1,2");
+ params.add("sort", "id asc");
+ assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
+ "//result/doc[1]/float[@name='id'][.='1.0']",
+ "//result/doc[2]/float[@name='id'][.='2.0']",
+ "//result/doc[3]/float[@name='id'][.='5.0']",
+ "//result/doc[4]/float[@name='id'][.='6.0']",
+ "//result/doc[5]/float[@name='id'][.='7.0']"
+ );
+
+ //Test without maxDocFreq param. Should default to Integer.MAX_VALUE and match all terms.
+ params = new ModifiableSolrParams();
+ params.add("q", "{!graphTerms f=group_s}1,2");
+ params.add("sort", "id asc");
+ assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
+ "//result/doc[1]/float[@name='id'][.='1.0']",
+ "//result/doc[2]/float[@name='id'][.='2.0']",
+ "//result/doc[3]/float[@name='id'][.='5.0']",
+ "//result/doc[4]/float[@name='id'][.='6.0']",
+ "//result/doc[5]/float[@name='id'][.='7.0']"
+ );
+
+ params = new ModifiableSolrParams();
+ params.add("q", "{!graphTerms f=group_s maxDocFreq=1}1,2");
+ params.add("sort", "id asc");
+ assertQ(req(params, "indent", "on"), "*[count(//doc)=0]"
+ );
+
+ //Test with int field
+ params = new ModifiableSolrParams();
+ params.add("q", "{!graphTerms f=test_ti maxDocFreq=10}5,10");
+ params.add("sort", "id asc");
+ assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
+ "//result/doc[1]/float[@name='id'][.='1.0']",
+ "//result/doc[2]/float[@name='id'][.='2.0']",
+ "//result/doc[3]/float[@name='id'][.='5.0']",
+ "//result/doc[4]/float[@name='id'][.='6.0']",
+ "//result/doc[5]/float[@name='id'][.='7.0']"
+ );
+
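+ //Test with int field and a lower maxDocFreq: term 5 occurs in 3 docs, which is not below maxDocFreq=3, so only the docs for term 10 match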
+ params = new ModifiableSolrParams();
+ params.add("q", "{!graphTerms f=test_ti maxDocFreq=3}5,10");
+ params.add("sort", "id asc");
+ assertQ(req(params, "indent", "on"), "*[count(//doc)=2]",
+ "//result/doc[1]/float[@name='id'][.='6.0']",
+ "//result/doc[2]/float[@name='id'][.='7.0']"
+ );
+ }
+}