You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mk...@apache.org on 2019/06/14 13:44:14 UTC
[lucene-solr] branch master updated: SOLR-13333: make terms.ttf
work without terms.list in standalone mode
This is an automated email from the ASF dual-hosted git repository.
mkhl pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 2ee0b16 SOLR-13333: make terms.ttf work without terms.list in standalone mode
2ee0b16 is described below
commit 2ee0b16bf1a3b0609b03ae103ef6568feea46d49
Author: Mikhail Khludnev <mk...@apache.org>
AuthorDate: Fri Jun 14 15:41:54 2019 +0200
SOLR-13333: make terms.ttf work without terms.list in standalone mode
---
solr/CHANGES.txt | 4 +-
.../solr/handler/component/TermsComponent.java | 195 ++++++++++-----------
.../solr/handler/component/TermsComponentTest.java | 48 +++++
3 files changed, 148 insertions(+), 99 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 370482d..fb16d7b 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -138,7 +138,9 @@ Bug Fixes
* SOLR-12013: collections API CUSTERSTATUS command fails when configset missing (Erick Erickson)
-* SOLR-13509: NPE on omitHeader=true is fixed by sending omitHeader=false to shard searches (Munendra S N, Mikhail Khludnev)
+* SOLR-13509: NPE on omitHeader=true is fixed by sending omitHeader=false to shard searches (Munendra S N, Mikhail Khludnev)
+
+* SOLR-13333: unleashing terms.ttf from terms.list when distrib=false (Munendra S N via Mikhail Khludnev)
Other Changes
----------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java b/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
index fb61a43..f5b03cf 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
@@ -18,11 +18,13 @@ package org.apache.solr.handler.component;
import java.io.IOException;
import java.util.Arrays;
+import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
+
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -146,9 +148,11 @@ public class TermsComponent extends SearchComponent {
String[] fields = params.getParams(TermsParams.TERMS_FIELD);
NamedList<Object> termsResult = new SimpleOrderedMap<>();
- rb.rsp.add("terms", termsResult);
+ rb.rsp.add(TermsParams.TERMS, termsResult);
- if (fields == null || fields.length==0) return;
+ if (fields == null || fields.length == 0) {
+ return;
+ }
boolean termStats = params.getBool(TermsParams.TERMS_STATS, false);
@@ -159,8 +163,8 @@ public class TermsComponent extends SearchComponent {
}
String termList = params.get(TermsParams.TERMS_LIST);
+ boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
if (termList != null) {
- boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
return;
}
@@ -184,17 +188,18 @@ public class TermsComponent extends SearchComponent {
boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
-
final LeafReader indexReader = rb.req.getSearcher().getSlowAtomicReader();
for (String field : fields) {
- NamedList<Integer> fieldTerms = new NamedList<>();
+ NamedList<Object> fieldTerms = new NamedList<>();
+ termsResult.add(field, fieldTerms);
Terms terms = indexReader.terms(field);
if (terms == null) {
// field does not exist in terms index. Check points.
SchemaField sf = rb.req.getSchema().getFieldOrNull(field);
if (sf != null && sf.getType().isPointField()) {
+ // FIXME: terms.ttf=true is not supported for pointFields
if (lowerStr!=null || upperStr!=null || prefix!=null || regexp!=null) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
String.format(Locale.ROOT, "The terms component does not support Points-based fields with sorting or with parameters %s,%s,%s,%s ", TermsParams.TERMS_LOWER, TermsParams.TERMS_UPPER, TermsParams.TERMS_PREFIX_STR, TermsParams.TERMS_REGEXP_STR));
@@ -203,7 +208,7 @@ public class TermsComponent extends SearchComponent {
if (sort) {
PointMerger.ValueIterator valueIterator = new PointMerger.ValueIterator(sf, rb.req.getSearcher().getRawReader().leaves());
MutableValue mv = valueIterator.getMutableValue();
- BoundedTreeSet<CountPair<MutableValue, Integer>> queue = (sort ? new BoundedTreeSet<>(limit) : null);
+ BoundedTreeSet<CountPair<MutableValue, Integer>> queue = new BoundedTreeSet<>(limit);
for (; ; ) {
long count = valueIterator.getNextCount();
@@ -217,11 +222,8 @@ public class TermsComponent extends SearchComponent {
for (CountPair<MutableValue, Integer> item : queue) {
fieldTerms.add(item.key.toString(), item.val);
}
- termsResult.add(field, fieldTerms);
continue;
- }
-
- if (!sort) {
+ } else {
/***
// streaming solution that is deferred until writing the response
// TODO: we can't use the streaming solution until XML writer supports PushWriter!
@@ -249,22 +251,19 @@ public class TermsComponent extends SearchComponent {
if (++num > limit) break;
fieldTerms.add(mv.toString(), (int)count); // match the numeric type of terms
}
-
- termsResult.add(field, fieldTerms);
continue;
}
}
-
- termsResult.add(field, fieldTerms); // add empty
continue;
}
- termsResult.add(field, fieldTerms);
FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
- if (ft==null) ft = new StrField();
+ if (ft == null) {
+ ft = new StrField();
+ }
// prefix must currently be text
- BytesRef prefixBytes = prefix==null ? null : new BytesRef(prefix);
+ BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);
BytesRef upperBytes = null;
if (upperStr != null) {
@@ -290,9 +289,8 @@ public class TermsComponent extends SearchComponent {
}
}
-
- TermsEnum termsEnum = terms.iterator();
- BytesRef term = null;
+ TermsEnum termsEnum = terms.iterator();
+ BytesRef term = null;
if (lowerBytes != null) {
if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
@@ -310,9 +308,9 @@ public class TermsComponent extends SearchComponent {
}
int i = 0;
- BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
+ BoundedTreeSet<TermsResponse.Term> queue = (sort? new BoundedTreeSet<>(limit, new TermCountComparator()): null);
CharsRefBuilder external = new CharsRefBuilder();
- while (term != null && (i<limit || sort)) {
+ while (term != null && (i < limit || sort)) {
boolean externalized = false; // did we fill in "external" yet for this term?
// stop if the prefix doesn't match
@@ -338,16 +336,16 @@ public class TermsComponent extends SearchComponent {
// This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
int docFreq = termsEnum.docFreq();
if (docFreq >= freqmin && docFreq <= freqmax) {
+ // TODO: handle raw somehow
+ if (!externalized) {
+ ft.indexedToReadable(term, external);
+ }
// add the term to the list
if (sort) {
- queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
+ queue.add(new TermsResponse.Term(external.toString(), docFreq, termsEnum.totalTermFreq()));
} else {
-
- // TODO: handle raw somehow
- if (!externalized) {
- ft.indexedToReadable(term, external);
- }
- fieldTerms.add(external.toString(), docFreq);
+ addTermToNamedList(fieldTerms, external.toString(),
+ docFreq, termsEnum.totalTermFreq(), includeTotalTermFreq);
i++;
}
}
@@ -356,10 +354,12 @@ public class TermsComponent extends SearchComponent {
}
if (sort) {
- for (CountPair<BytesRef, Integer> item : queue) {
- if (i >= limit) break;
- ft.indexedToReadable(item.key, external);
- fieldTerms.add(external.toString(), item.val);
+ for (TermsResponse.Term item : queue) {
+ if (i >= limit) {
+ break;
+ }
+ addTermToNamedList(fieldTerms, item.getTerm(), item.getFrequency(),
+ item.getTotalTermFreq(), includeTotalTermFreq);
i++;
}
}
@@ -438,8 +438,8 @@ public class TermsComponent extends SearchComponent {
rb.rsp.add("terms", terms);
if(ti.stats) {
- NamedList<Number> stats = new SimpleOrderedMap();
- stats.add("numDocs", Long.valueOf(ti.numDocs));
+ NamedList<Number> stats = new SimpleOrderedMap<>();
+ stats.add("numDocs", ti.numDocs);
rb.rsp.add("indexstats", stats);
}
rb._termsHelper = null;
@@ -497,7 +497,7 @@ public class TermsComponent extends SearchComponent {
}
public void parse(NamedList<NamedList<Object>> terms) {
- // exit if there is no terms
+ // exit if there are no terms
if (terms == null) {
return;
}
@@ -540,28 +540,19 @@ public class TermsComponent extends SearchComponent {
sort = false;
}
- // init minimum frequency
- long freqmin = 1;
- String s = params.get(TermsParams.TERMS_MINCOUNT);
- if (s != null) freqmin = Long.parseLong(s);
+ long freqmin = params.getLong(TermsParams.TERMS_MINCOUNT, 1);
- // init maximum frequency, default to max int
- long freqmax = -1;
- s = params.get(TermsParams.TERMS_MAXCOUNT);
- if (s != null) freqmax = Long.parseLong(s);
+ long freqmax = params.getLong(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
if (freqmax < 0) {
freqmax = Long.MAX_VALUE;
}
- // init limit, default to max int
- long limit = 10;
- s = params.get(TermsParams.TERMS_LIMIT);
- if (s != null) limit = Long.parseLong(s);
+ long limit = params.getLong(TermsParams.TERMS_LIMIT, 10);
if (limit < 0) {
limit = Long.MAX_VALUE;
}
- // loop though each field we want terms from
+ // loop through each field we want terms from
for (String key : fieldmap.keySet()) {
NamedList<Object> fieldterms = new SimpleOrderedMap<>();
TermsResponse.Term[] data = null;
@@ -572,18 +563,11 @@ public class TermsComponent extends SearchComponent {
}
boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
- // loop though each term until we hit limit
+ // loop through each term until we hit limit
int cnt = 0;
for (TermsResponse.Term tc : data) {
if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) {
- if (includeTotalTermFreq) {
- NamedList<Number> termStats = new SimpleOrderedMap<>();
- termStats.add("df", tc.getFrequency());
- termStats.add("ttf", tc.getTotalTermFreq());
- fieldterms.add(tc.getTerm(), termStats);
- } else {
- fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
- }
+ addTermToNamedList(fieldterms, tc.getTerm(), tc.getFrequency(), tc.getTotalTermFreq(), includeTotalTermFreq);
cnt++;
}
@@ -617,19 +601,7 @@ public class TermsComponent extends SearchComponent {
public TermsResponse.Term[] getCountSorted(HashMap<String, TermsResponse.Term> data) {
TermsResponse.Term[] arr = data.values().toArray(new TermsResponse.Term[data.size()]);
- Arrays.sort(arr, (o1, o2) -> {
- long freq1 = o1.getFrequency();
- long freq2 = o2.getFrequency();
-
- if (freq2 < freq1) {
- return -1;
- } else if (freq1 < freq2) {
- return 1;
- } else {
- return o1.getTerm().compareTo(o2.getTerm());
- }
- });
-
+ Arrays.sort(arr, new TermCountComparator());
return arr;
}
}
@@ -649,40 +621,30 @@ public class TermsComponent extends SearchComponent {
for (String field : fields) {
SchemaField sf = indexSearcher.getSchema().getField(field);
FieldType fieldType = sf.getType();
-
+ NamedList<Object> termsMap = new SimpleOrderedMap<>();
+
if (fieldType.isPointField()) {
- NamedList<Object> termsMap = new SimpleOrderedMap<>();
for (String term : splitTerms) {
Query q = fieldType.getFieldQuery(null, sf, term);
int count = indexSearcher.getDocSet(q).size();
termsMap.add(term, count);
}
- result.add(field, termsMap);
- continue;
- }
-
- // Since splitTerms is already sorted, this array will also be sorted. NOTE: this may not be true, it depends on readableToIndexed.
- Term[] terms = new Term[splitTerms.length];
- for (int i = 0; i < splitTerms.length; i++) {
- terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
- }
-
- TermStates[] termStates = new TermStates[terms.length];
- collectTermStates(topReaderContext, termStates, terms);
+ } else {
- NamedList<Object> termsMap = new SimpleOrderedMap<>();
- for (int i = 0; i < terms.length; i++) {
- if (termStates[i] != null) {
- String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
- int docFreq = termStates[i].docFreq();
- if (!includeTotalTermFreq) {
- termsMap.add(outTerm, docFreq);
- } else {
- long totalTermFreq = termStates[i].totalTermFreq();
- NamedList<Long> termStats = new SimpleOrderedMap<>();
- termStats.add("df", (long) docFreq);
- termStats.add("ttf", totalTermFreq);
- termsMap.add(outTerm, termStats);
+ // Since splitTerms is already sorted, this array will also be sorted. NOTE: this may not be true, it depends on readableToIndexed.
+ Term[] terms = new Term[splitTerms.length];
+ for (int i = 0; i < splitTerms.length; i++) {
+ terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
+ }
+
+ TermStates[] termStates = new TermStates[terms.length];
+ collectTermStates(topReaderContext, termStates, terms);
+
+ for (int i = 0; i < terms.length; i++) {
+ if (termStates[i] != null) {
+ String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
+ int docFreq = termStates[i].docFreq();
+ addTermToNamedList(termsMap, outTerm, docFreq, termStates[i].totalTermFreq(), includeTotalTermFreq);
}
}
}
@@ -719,6 +681,43 @@ public class TermsComponent extends SearchComponent {
}
}
+ /**
+ * Helper method to add particular term to terms response
+ */
+ private static void addTermToNamedList(NamedList<Object> result, String term, long docFreq,
+ long totalTermFreq, boolean includeTotalTermFreq) {
+
+ if (includeTotalTermFreq) {
+ NamedList<Number> termStats = new SimpleOrderedMap<>();
+ termStats.add("df", docFreq);
+ termStats.add("ttf", totalTermFreq);
+ result.add(term, termStats);
+ } else {
+ result.add(term, TermsHelper.num(docFreq));
+ }
+ }
+
+ /**
+ * Comparator for {@link org.apache.solr.client.solrj.response.TermsResponse.Term} sorting
+ * This sorts term by frequency in descending order
+ */
+ public static class TermCountComparator implements Comparator<TermsResponse.Term> {
+
+ @Override
+ public int compare(TermsResponse.Term o1, TermsResponse.Term o2) {
+ long freq1 = o1.getFrequency();
+ long freq2 = o2.getFrequency();
+
+ if (freq2 < freq1) {
+ return -1;
+ } else if (freq1 < freq2) {
+ return 1;
+ } else {
+ return o1.getTerm().compareTo(o2.getTerm());
+ }
+ }
+ }
+
private static void collectStats(SolrIndexSearcher searcher, NamedList<Number> stats) {
int numDocs = searcher.getTopReaderContext().reader().numDocs();
stats.add("numDocs", Long.valueOf(numDocs));
diff --git a/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
index 1fa100e..aeb7273 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
@@ -347,6 +347,27 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='ttf'][.='3']",
"//lst[@name='standardfilt']/lst[@name='spider']/long[@name='df'][.='1']",
"//lst[@name='standardfilt']/lst[@name='spider']/long[@name='ttf'][.='1']");
+
+ // terms.limit=-1 and terms.sort=count and NO terms.list
+ req = req(
+ "indent","true",
+ "qt", "/terms",
+ "terms", "true",
+ "terms.fl", "standardfilt",
+ "terms.ttf", "true",
+ "terms.limit", "-1",
+ "terms.sort", "count"
+ );
+ assertQ(req,
+ "count(//lst[@name='standardfilt']/*)>=4", // it would be at-least 4
+ "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='df'][.='4']",
+ "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='ttf'][.='4']",
+ "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='df'][.='2']",
+ "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='ttf'][.='2']",
+ "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='df'][.='3']",
+ "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='ttf'][.='3']",
+ "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='df'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='ttf'][.='1']");
}
@Test
@@ -389,6 +410,33 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
"//lst[@name='standardfilt']/lst[@name='aa']/long[@name='ttf'][.='1']",
"//lst[@name='standardfilt']/lst[@name='aaa']/long[@name='df'][.='1']",
"//lst[@name='standardfilt']/lst[@name='aaa']/long[@name='ttf'][.='1']");
+
+ // terms.ttf=true, terms.sort=index and no terms list
+ req = req(
+ "indent","true",
+ "qt", "/terms",
+ "terms", "true",
+ "terms.fl", "lowerfilt",
+ "terms.fl", "standardfilt",
+ "terms.ttf", "true",
+ "terms.sort", "index",
+ "terms.limit", "10"
+ );
+ assertQ(req,
+ "count(//lst[@name='lowerfilt']/*)<=10",
+ "count(//lst[@name='standardfilt']/*)<=10",
+ "//lst[@name='lowerfilt']/lst[@name='a']/long[@name='df'][.='2']",
+ "//lst[@name='lowerfilt']/lst[@name='a']/long[@name='ttf'][.='2']",
+ "//lst[@name='lowerfilt']/lst[@name='aa']/long[@name='df'][.='1']",
+ "//lst[@name='lowerfilt']/lst[@name='aa']/long[@name='ttf'][.='1']",
+ "//lst[@name='lowerfilt']/lst[@name='aaa']/long[@name='df'][.='1']",
+ "//lst[@name='lowerfilt']/lst[@name='aaa']/long[@name='ttf'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='a']/long[@name='df'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='a']/long[@name='ttf'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='aa']/long[@name='df'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='aa']/long[@name='ttf'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='aaa']/long[@name='df'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='aaa']/long[@name='ttf'][.='1']");
}
@Test