You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2017/03/29 03:23:14 UTC
lucene-solr:master: SOLR-10349: Add totalTermFreq support to TermsComponent
Repository: lucene-solr
Updated Branches:
refs/heads/master 144091ad2 -> deddc9b5c
SOLR-10349: Add totalTermFreq support to TermsComponent
TermsComponent only returns docFreq information per requested term.
This commit adds a terms.ttf parameter which, if set to true, makes
the component return both docFreq and totalTermFreq statistics for
each requested term.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/deddc9b5
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/deddc9b5
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/deddc9b5
Branch: refs/heads/master
Commit: deddc9b5c8d8c2859469583fa8b956be48efff82
Parents: 144091a
Author: Shai Erera <sh...@apache.org>
Authored: Thu Mar 23 08:28:05 2017 +0200
Committer: Shai Erera <sh...@apache.org>
Committed: Wed Mar 29 06:18:39 2017 +0300
----------------------------------------------------------------------
solr/CHANGES.txt | 6 +-
.../solr/handler/component/TermsComponent.java | 66 ++++++++++++--------
.../DistributedTermsComponentTest.java | 3 +-
.../handler/component/TermsComponentTest.java | 38 +++++++++++
.../client/solrj/response/QueryResponse.java | 6 +-
.../client/solrj/response/TermsResponse.java | 37 +++++++++--
.../apache/solr/common/params/TermsParams.java | 12 ++--
7 files changed, 126 insertions(+), 42 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/deddc9b5/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 9d14e59..4e63926 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -115,17 +115,19 @@ Detailed Change List
New Features
----------------------
-* SOLR-9992: Add support for grouping with PointFIelds. (Cao Manh Dat)
+* SOLR-9992: Add support for grouping with PointFIelds. (Cao Manh Dat)
* SOLR-10046: Add UninvertDocValuesMergePolicyFactory class. (Keith Laban, Christine Poerschke)
-* SOLR-9994: Add support for CollapseQParser with PointFields. (Varun Thacker, Cao Manh Dat)
+* SOLR-9994: Add support for CollapseQParser with PointFields. (Varun Thacker, Cao Manh Dat)
* SOLR-10076: Hide keystore and truststore passwords from /admin/info/* outputs. (Mano Kovacs via Mark Miller)
* SOLR-6736: Adding support for uploading zipped configsets using ConfigSets API (Varun Rajput, Ishan Chattopadhyaya,
Noble Paul, Anshum Gupta, Gregory Chanan)
+* SOLR-10349: Add totalTermFreq support to TermsComponent. (Shai Erera)
+
Optimizations
----------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/deddc9b5/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java b/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
index e00120c..b05939e 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
@@ -108,8 +108,9 @@ public class TermsComponent extends SearchComponent {
}
String termList = params.get(TermsParams.TERMS_LIST);
- if(termList != null) {
- fetchTerms(rb.req.getSearcher(), fields, termList, termsResult);
+ if (termList != null) {
+ boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
+ fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
return;
}
@@ -303,7 +304,7 @@ public class TermsComponent extends SearchComponent {
if (th != null) {
for (ShardResponse srsp : sreq.responses) {
@SuppressWarnings("unchecked")
- NamedList<NamedList<Number>> terms = (NamedList<NamedList<Number>>) srsp.getSolrResponse().getResponse().get("terms");
+ NamedList<NamedList<Object>> terms = (NamedList<NamedList<Object>>) srsp.getSolrResponse().getResponse().get("terms");
th.parse(terms);
@@ -376,7 +377,7 @@ public class TermsComponent extends SearchComponent {
}
}
- public void parse(NamedList<NamedList<Number>> terms) {
+ public void parse(NamedList<NamedList<Object>> terms) {
// exit if there is no terms
if (terms == null) {
return;
@@ -400,6 +401,7 @@ public class TermsComponent extends SearchComponent {
if (termmap.containsKey(term)) {
TermsResponse.Term oldtc = termmap.get(term);
oldtc.addFrequency(tc.getFrequency());
+ oldtc.addTotalTermFreq(tc.getTotalTermFreq());
termmap.put(term, oldtc);
} else {
termmap.put(term, tc);
@@ -442,7 +444,7 @@ public class TermsComponent extends SearchComponent {
// loop though each field we want terms from
for (String key : fieldmap.keySet()) {
- NamedList<Number> fieldterms = new SimpleOrderedMap<>();
+ NamedList<Object> fieldterms = new SimpleOrderedMap<>();
TermsResponse.Term[] data = null;
if (sort) {
data = getCountSorted(fieldmap.get(key));
@@ -450,11 +452,19 @@ public class TermsComponent extends SearchComponent {
data = getLexSorted(fieldmap.get(key));
}
+ boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
// loop though each term until we hit limit
int cnt = 0;
for (TermsResponse.Term tc : data) {
if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) {
- fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
+ if (includeTotalTermFreq) {
+ NamedList<Number> termStats = new SimpleOrderedMap<>();
+ termStats.add("docFreq", tc.getFrequency());
+ termStats.add("totalTermFreq", tc.getTotalTermFreq());
+ fieldterms.add(tc.getTerm(), termStats);
+ } else {
+ fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
+ }
cnt++;
}
@@ -508,10 +518,9 @@ public class TermsComponent extends SearchComponent {
private void fetchTerms(SolrIndexSearcher indexSearcher,
String[] fields,
String termList,
+ boolean includeTotalTermFreq,
NamedList result) throws IOException {
- NamedList termsMap = new SimpleOrderedMap();
- List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
String field = fields[0];
FieldType fieldType = indexSearcher.getSchema().getField(field).getType();
String[] splitTerms = termList.split(",");
@@ -521,35 +530,43 @@ public class TermsComponent extends SearchComponent {
}
Term[] terms = new Term[splitTerms.length];
- TermContext[] termContexts = new TermContext[terms.length];
for(int i=0; i<splitTerms.length; i++) {
terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
}
Arrays.sort(terms);
- collectTermContext(indexSearcher.getTopReaderContext().reader(), leaves, termContexts, terms);
+ IndexReaderContext topReaderContext = indexSearcher.getTopReaderContext();
+ TermContext[] termContexts = new TermContext[terms.length];
+ collectTermContext(topReaderContext, termContexts, terms);
- for(int i=0; i<terms.length; i++) {
- if(termContexts[i] != null) {
+ NamedList termsMap = new SimpleOrderedMap();
+ for (int i = 0; i < terms.length; i++) {
+ if (termContexts[i] != null) {
String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
int docFreq = termContexts[i].docFreq();
- termsMap.add(outTerm, docFreq);
+ if (!includeTotalTermFreq) {
+ termsMap.add(outTerm, docFreq);
+ } else {
+ long totalTermFreq = termContexts[i].totalTermFreq();
+ NamedList<Long> termStats = new SimpleOrderedMap<>();
+ termStats.add("docFreq", (long) docFreq);
+ termStats.add("totalTermFreq", totalTermFreq);
+ termsMap.add(outTerm, termStats);
+ }
}
}
result.add(field, termsMap);
}
- private void collectTermContext(IndexReader reader,
- List<LeafReaderContext> leaves, TermContext[] contextArray,
- Term[] queryTerms) throws IOException {
+ private void collectTermContext(IndexReaderContext topReaderContext, TermContext[] contextArray, Term[] queryTerms)
+ throws IOException {
TermsEnum termsEnum = null;
- for (LeafReaderContext context : leaves) {
+ for (LeafReaderContext context : topReaderContext.leaves()) {
final Fields fields = context.reader().fields();
for (int i = 0; i < queryTerms.length; i++) {
Term term = queryTerms[i];
- TermContext termContext = contextArray[i];
final Terms terms = fields.terms(term.field());
if (terms == null) {
// field does not exist
@@ -559,18 +576,15 @@ public class TermsComponent extends SearchComponent {
assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY) continue;
+
+ TermContext termContext = contextArray[i];
if (termsEnum.seekExact(term.bytes())) {
if (termContext == null) {
- contextArray[i] = new TermContext(reader.getContext(),
- termsEnum.termState(), context.ord, termsEnum.docFreq(),
- termsEnum.totalTermFreq());
- } else {
- termContext.register(termsEnum.termState(), context.ord,
- termsEnum.docFreq(), termsEnum.totalTermFreq());
+ termContext = new TermContext(topReaderContext);
+ contextArray[i] = termContext;
}
-
+ termContext.accumulateStatistics(termsEnum.docFreq(), termsEnum.totalTermFreq());
}
-
}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/deddc9b5/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
index 951cd88..9c90efb 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
@@ -52,7 +52,6 @@ public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra, ant, bad");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.list", "2, 3, 1");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.stats", "true","terms.list", "2, 3, 1");
-
-
+ query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra", "terms.ttf", "true");
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/deddc9b5/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
index 177881a..7fb5e12 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
@@ -18,6 +18,7 @@ package org.apache.solr.handler.component;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.TermsParams;
+import org.apache.solr.request.SolrQueryRequest;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -313,4 +314,41 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
,"count(//lst[@name='standardfilt']/*)=3"
);
}
+
+ @Test
+ public void testDocFreqAndTotalTermFreq() throws Exception {
+ SolrQueryRequest req = req(
+ "indent","true",
+ "qt", "/terms",
+ "terms", "true",
+ "terms.fl", "standardfilt",
+ "terms.ttf", "true",
+ "terms.list", "snake,spider,shark,ddddd");
+ assertQ(req,
+ "count(//lst[@name='standardfilt']/*)=4",
+ "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='docFreq'][.='4']",
+ "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='totalTermFreq'][.='4']",
+ "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='docFreq'][.='2']",
+ "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='totalTermFreq'][.='2']",
+ "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
+ "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']",
+ "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='docFreq'][.='1']",
+ "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='totalTermFreq'][.='1']");
+ }
+
+ @Test
+ public void testDocFreqAndTotalTermFreqForNonExistingTerm() throws Exception {
+ SolrQueryRequest req = req(
+ "indent","true",
+ "qt", "/terms",
+ "terms", "true",
+ "terms.fl", "standardfilt",
+ "terms.ttf", "true",
+ "terms.list", "boo,snake");
+ assertQ(req,
+ "count(//lst[@name='standardfilt']/*)=1",
+ "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
+ "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']");
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/deddc9b5/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java b/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
index eb595aa..4e78005 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
@@ -50,7 +50,7 @@ public class QueryResponse extends SolrResponseBase
private List<NamedList<Object>> _clusterInfo = null;
private Map<String,NamedList<Object>> _suggestInfo = null;
private NamedList<Object> _statsInfo = null;
- private NamedList<NamedList<Number>> _termsInfo = null;
+ private NamedList<NamedList<Object>> _termsInfo = null;
private NamedList<SolrDocumentList> _moreLikeThisInfo = null;
private String _cursorMarkNext = null;
@@ -166,7 +166,7 @@ public class QueryResponse extends SolrResponseBase
extractStatsInfo( _statsInfo );
}
else if ( "terms".equals( n ) ) {
- _termsInfo = (NamedList<NamedList<Number>>) res.getVal( i );
+ _termsInfo = (NamedList<NamedList<Object>>) res.getVal( i );
extractTermsInfo( _termsInfo );
}
else if ( "moreLikeThis".equals( n ) ) {
@@ -191,7 +191,7 @@ public class QueryResponse extends SolrResponseBase
_suggestResponse = new SuggesterResponse(suggestInfo);
}
- private void extractTermsInfo(NamedList<NamedList<Number>> termsInfo) {
+ private void extractTermsInfo(NamedList<NamedList<Object>> termsInfo) {
_termsResponse = new TermsResponse(termsInfo);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/deddc9b5/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java b/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
index e3fb061..b4ee553 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
@@ -28,17 +28,26 @@ import java.util.Map;
public class TermsResponse {
private Map<String, List<Term>> termMap = new HashMap<>();
- public TermsResponse(NamedList<NamedList<Number>> termsInfo) {
+ public TermsResponse(NamedList<NamedList<Object>> termsInfo) {
for (int i = 0; i < termsInfo.size(); i++) {
String fieldName = termsInfo.getName(i);
List<Term> itemList = new ArrayList<>();
- NamedList<Number> items = termsInfo.getVal(i);
+ NamedList<Object> items = termsInfo.getVal(i);
for (int j = 0; j < items.size(); j++) {
- Term t = new Term(items.getName(j), items.getVal(j).longValue());
+ String term = items.getName(j);
+ Object val = items.getVal(j);
+ Term t;
+ if (val instanceof NamedList) {
+ @SuppressWarnings("unchecked")
+ NamedList<Number> termStats = (NamedList<Number>) val;
+ t = new Term(term, termStats.get("docFreq").longValue(), termStats.get("totalTermFreq").longValue());
+ } else {
+ t = new Term(term, ((Number) val).longValue());
+ }
itemList.add(t);
}
-
+
termMap.put(fieldName, itemList);
}
}
@@ -59,10 +68,16 @@ public class TermsResponse {
public static class Term {
private String term;
private long frequency;
+ private long totalTermFreq;
public Term(String term, long frequency) {
+ this(term, frequency, 0);
+ }
+
+ public Term(String term, long frequency, long totalTermFreq) {
this.term = term;
this.frequency = frequency;
+ this.totalTermFreq = totalTermFreq;
}
public String getTerm() {
@@ -80,9 +95,21 @@ public class TermsResponse {
public void setFrequency(long frequency) {
this.frequency = frequency;
}
-
+
public void addFrequency(long frequency) {
this.frequency += frequency;
}
+
+ public long getTotalTermFreq() {
+ return totalTermFreq;
+ }
+
+ public void setTotalTermFreq(long totalTermFreq) {
+ this.totalTermFreq = totalTermFreq;
+ }
+
+ public void addTotalTermFreq(long totalTermFreq) {
+ this.totalTermFreq += totalTermFreq;
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/deddc9b5/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java b/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
index 4975846..9f96a80 100644
--- a/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
+++ b/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
@@ -42,17 +42,20 @@ public interface TermsParams {
/**
* Optional. The list of terms to be retrieved.
- *
*/
public static final String TERMS_LIST = TERMS_PREFIX + "list";
/**
- * Optional. The list of terms to be retrieved.
- *
+ * Optional. If true, also returns index-level statistics, such as numDocs.
*/
public static final String TERMS_STATS = TERMS_PREFIX + "stats";
/**
+ * Optional. If true, also returns terms' total term frequency.
+ */
+ public static final String TERMS_TTF = TERMS_PREFIX + "ttf";
+
+ /**
* Optional. The lower bound term to start at. The TermEnum will start at the next term after this term in the dictionary.
*
* If not specified, the empty string is used
@@ -107,10 +110,11 @@ public interface TermsParams {
}
}
- /**
+ /**
* Optional. The minimum value of docFreq to be returned. 1 by default
*/
public static final String TERMS_MINCOUNT = TERMS_PREFIX + "mincount";
+
/**
* Optional. The maximum value of docFreq to be returned. -1 by default means no boundary
*/