You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2017/03/29 04:13:48 UTC

lucene-solr:branch_6x: SOLR-10349: Add totalTermFreq support to TermsComponent

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x f36b2bfbb -> bcc36b900


SOLR-10349: Add totalTermFreq support to TermsComponent

TermsComponent only returns docFreq information per requested term.
This commit adds a terms.ttf parameter which, if set to true, makes it
return both docFreq and totalTermFreq statistics for each requested
term.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/bcc36b90
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/bcc36b90
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/bcc36b90

Branch: refs/heads/branch_6x
Commit: bcc36b9005afc5a36c1e9fc28ae6a9e5aedcd83d
Parents: f36b2bf
Author: Shai Erera <sh...@apache.org>
Authored: Thu Mar 23 08:28:05 2017 +0200
Committer: Shai Erera <sh...@apache.org>
Committed: Wed Mar 29 06:25:13 2017 +0300

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  2 +
 .../solr/handler/component/TermsComponent.java  | 66 ++++++++++++--------
 .../DistributedTermsComponentTest.java          |  3 +-
 .../handler/component/TermsComponentTest.java   | 38 +++++++++++
 .../client/solrj/response/QueryResponse.java    |  6 +-
 .../client/solrj/response/TermsResponse.java    | 37 +++++++++--
 .../apache/solr/common/params/TermsParams.java  | 12 ++--
 7 files changed, 124 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc36b90/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index bcd7f48..4114558 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -51,6 +51,8 @@ New Features
 * SOLR-6736: Adding support for uploading zipped configsets using ConfigSets API (Varun Rajput, Ishan Chattopadhyaya,
   Noble Paul, Anshum Gupta, Gregory Chanan)
 
+* SOLR-10349: Add totalTermFreq support to TermsComponent. (Shai Erera)
+
 Optimizations
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc36b90/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java b/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
index e00120c..b05939e 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/TermsComponent.java
@@ -108,8 +108,9 @@ public class TermsComponent extends SearchComponent {
     }
 
     String termList = params.get(TermsParams.TERMS_LIST);
-    if(termList != null) {
-      fetchTerms(rb.req.getSearcher(), fields, termList, termsResult);
+    if (termList != null) {
+      boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
+      fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
       return;
     }
 
@@ -303,7 +304,7 @@ public class TermsComponent extends SearchComponent {
     if (th != null) {
       for (ShardResponse srsp : sreq.responses) {
         @SuppressWarnings("unchecked")
-        NamedList<NamedList<Number>> terms = (NamedList<NamedList<Number>>) srsp.getSolrResponse().getResponse().get("terms");
+        NamedList<NamedList<Object>> terms = (NamedList<NamedList<Object>>) srsp.getSolrResponse().getResponse().get("terms");
         th.parse(terms);
 
 
@@ -376,7 +377,7 @@ public class TermsComponent extends SearchComponent {
       }
     }
 
-    public void parse(NamedList<NamedList<Number>> terms) {
+    public void parse(NamedList<NamedList<Object>> terms) {
       // exit if there is no terms
       if (terms == null) {
         return;
@@ -400,6 +401,7 @@ public class TermsComponent extends SearchComponent {
           if (termmap.containsKey(term)) {
             TermsResponse.Term oldtc = termmap.get(term);
             oldtc.addFrequency(tc.getFrequency());
+            oldtc.addTotalTermFreq(tc.getTotalTermFreq());
             termmap.put(term, oldtc);
           } else {
             termmap.put(term, tc);
@@ -442,7 +444,7 @@ public class TermsComponent extends SearchComponent {
 
       // loop though each field we want terms from
       for (String key : fieldmap.keySet()) {
-        NamedList<Number> fieldterms = new SimpleOrderedMap<>();
+        NamedList<Object> fieldterms = new SimpleOrderedMap<>();
         TermsResponse.Term[] data = null;
         if (sort) {
           data = getCountSorted(fieldmap.get(key));
@@ -450,11 +452,19 @@ public class TermsComponent extends SearchComponent {
           data = getLexSorted(fieldmap.get(key));
         }
 
+        boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
         // loop though each term until we hit limit
         int cnt = 0;
         for (TermsResponse.Term tc : data) {
           if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) {
-            fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
+            if (includeTotalTermFreq) {
+              NamedList<Number> termStats = new SimpleOrderedMap<>();
+              termStats.add("docFreq", tc.getFrequency());
+              termStats.add("totalTermFreq", tc.getTotalTermFreq());
+              fieldterms.add(tc.getTerm(), termStats);
+            } else {
+              fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
+            }
             cnt++;
           }
 
@@ -508,10 +518,9 @@ public class TermsComponent extends SearchComponent {
   private void fetchTerms(SolrIndexSearcher indexSearcher,
                           String[] fields,
                           String termList,
+                          boolean includeTotalTermFreq,
                           NamedList result) throws IOException {
 
-    NamedList termsMap = new SimpleOrderedMap();
-    List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
     String field = fields[0];
     FieldType fieldType = indexSearcher.getSchema().getField(field).getType();
     String[] splitTerms = termList.split(",");
@@ -521,35 +530,43 @@ public class TermsComponent extends SearchComponent {
     }
 
     Term[] terms = new Term[splitTerms.length];
-    TermContext[] termContexts = new TermContext[terms.length];
     for(int i=0; i<splitTerms.length; i++) {
       terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
     }
 
     Arrays.sort(terms);
 
-    collectTermContext(indexSearcher.getTopReaderContext().reader(), leaves, termContexts, terms);
+    IndexReaderContext topReaderContext = indexSearcher.getTopReaderContext();
+    TermContext[] termContexts = new TermContext[terms.length];
+    collectTermContext(topReaderContext, termContexts, terms);
 
-    for(int i=0; i<terms.length; i++) {
-      if(termContexts[i] != null) {
+    NamedList termsMap = new SimpleOrderedMap();
+    for (int i = 0; i < terms.length; i++) {
+      if (termContexts[i] != null) {
         String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
         int docFreq = termContexts[i].docFreq();
-        termsMap.add(outTerm, docFreq);
+        if (!includeTotalTermFreq) {
+          termsMap.add(outTerm, docFreq);
+        } else {
+          long totalTermFreq = termContexts[i].totalTermFreq();
+          NamedList<Long> termStats = new SimpleOrderedMap<>();
+          termStats.add("docFreq", (long) docFreq);
+          termStats.add("totalTermFreq", totalTermFreq);
+          termsMap.add(outTerm, termStats);
+        }
       }
     }
 
     result.add(field, termsMap);
   }
 
-  private void collectTermContext(IndexReader reader,
-                                 List<LeafReaderContext> leaves, TermContext[] contextArray,
-                                 Term[] queryTerms) throws IOException {
+  private void collectTermContext(IndexReaderContext topReaderContext, TermContext[] contextArray, Term[] queryTerms)
+      throws IOException {
     TermsEnum termsEnum = null;
-    for (LeafReaderContext context : leaves) {
+    for (LeafReaderContext context : topReaderContext.leaves()) {
       final Fields fields = context.reader().fields();
       for (int i = 0; i < queryTerms.length; i++) {
         Term term = queryTerms[i];
-        TermContext termContext = contextArray[i];
         final Terms terms = fields.terms(term.field());
         if (terms == null) {
           // field does not exist
@@ -559,18 +576,15 @@ public class TermsComponent extends SearchComponent {
         assert termsEnum != null;
 
         if (termsEnum == TermsEnum.EMPTY) continue;
+
+        TermContext termContext = contextArray[i];
         if (termsEnum.seekExact(term.bytes())) {
           if (termContext == null) {
-            contextArray[i] = new TermContext(reader.getContext(),
-                termsEnum.termState(), context.ord, termsEnum.docFreq(),
-                termsEnum.totalTermFreq());
-          } else {
-            termContext.register(termsEnum.termState(), context.ord,
-                termsEnum.docFreq(), termsEnum.totalTermFreq());
+            termContext = new TermContext(topReaderContext);
+            contextArray[i] = termContext;
           }
-
+          termContext.accumulateStatistics(termsEnum.docFreq(), termsEnum.totalTermFreq());
         }
-
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc36b90/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
index 951cd88..9c90efb 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedTermsComponentTest.java
@@ -52,7 +52,6 @@ public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase
     query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra, ant, bad");
     query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.list", "2, 3, 1");
     query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.stats", "true","terms.list", "2, 3, 1");
-
-
+    query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra", "terms.ttf", "true");
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc36b90/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
index 177881a..7fb5e12 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/TermsComponentTest.java
@@ -18,6 +18,7 @@ package org.apache.solr.handler.component;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.TermsParams;
+import org.apache.solr.request.SolrQueryRequest;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -313,4 +314,41 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
        ,"count(//lst[@name='standardfilt']/*)=3"
     );
   }
+
+  @Test
+  public void testDocFreqAndTotalTermFreq() throws Exception {
+    SolrQueryRequest req = req(
+        "indent","true",
+        "qt", "/terms",
+        "terms", "true",
+        "terms.fl", "standardfilt",
+        "terms.ttf", "true",
+        "terms.list", "snake,spider,shark,ddddd");
+    assertQ(req,
+        "count(//lst[@name='standardfilt']/*)=4",
+        "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='docFreq'][.='4']",
+        "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='totalTermFreq'][.='4']",
+        "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='docFreq'][.='2']",
+        "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='totalTermFreq'][.='2']",
+        "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
+        "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']",
+        "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='docFreq'][.='1']",
+        "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='totalTermFreq'][.='1']");
+  }
+
+  @Test
+  public void testDocFreqAndTotalTermFreqForNonExistingTerm() throws Exception {
+    SolrQueryRequest req = req(
+        "indent","true",
+        "qt", "/terms",
+        "terms", "true",
+        "terms.fl", "standardfilt",
+        "terms.ttf", "true",
+        "terms.list", "boo,snake");
+    assertQ(req,
+        "count(//lst[@name='standardfilt']/*)=1",
+        "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
+        "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']");
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc36b90/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java b/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
index eb595aa..4e78005 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/response/QueryResponse.java
@@ -50,7 +50,7 @@ public class QueryResponse extends SolrResponseBase
   private List<NamedList<Object>> _clusterInfo = null;
   private Map<String,NamedList<Object>> _suggestInfo = null;
   private NamedList<Object> _statsInfo = null;
-  private NamedList<NamedList<Number>> _termsInfo = null;
+  private NamedList<NamedList<Object>> _termsInfo = null;
   private NamedList<SolrDocumentList> _moreLikeThisInfo = null;
   private String _cursorMarkNext = null;
 
@@ -166,7 +166,7 @@ public class QueryResponse extends SolrResponseBase
         extractStatsInfo( _statsInfo );
       }
       else if ( "terms".equals( n ) ) {
-        _termsInfo = (NamedList<NamedList<Number>>) res.getVal( i );
+        _termsInfo = (NamedList<NamedList<Object>>) res.getVal( i );
         extractTermsInfo( _termsInfo );
       }
       else if ( "moreLikeThis".equals( n ) ) {
@@ -191,7 +191,7 @@ public class QueryResponse extends SolrResponseBase
     _suggestResponse = new SuggesterResponse(suggestInfo);
   }
 
-  private void extractTermsInfo(NamedList<NamedList<Number>> termsInfo) {
+  private void extractTermsInfo(NamedList<NamedList<Object>> termsInfo) {
     _termsResponse = new TermsResponse(termsInfo);
   }
   

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc36b90/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java b/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
index e3fb061..b4ee553 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/response/TermsResponse.java
@@ -28,17 +28,26 @@ import java.util.Map;
 public class TermsResponse {
   private Map<String, List<Term>> termMap = new HashMap<>();
   
-  public TermsResponse(NamedList<NamedList<Number>> termsInfo) {
+  public TermsResponse(NamedList<NamedList<Object>> termsInfo) {
     for (int i = 0; i < termsInfo.size(); i++) {
       String fieldName = termsInfo.getName(i);
       List<Term> itemList = new ArrayList<>();
-      NamedList<Number> items = termsInfo.getVal(i);
+      NamedList<Object> items = termsInfo.getVal(i);
       
       for (int j = 0; j < items.size(); j++) {
-        Term t = new Term(items.getName(j), items.getVal(j).longValue());
+        String term = items.getName(j);
+        Object val = items.getVal(j);
+        Term t;
+        if (val instanceof NamedList) {
+          @SuppressWarnings("unchecked")
+          NamedList<Number> termStats = (NamedList<Number>) val;
+          t = new Term(term, termStats.get("docFreq").longValue(), termStats.get("totalTermFreq").longValue());
+        } else {
+          t = new Term(term, ((Number) val).longValue());
+        }
         itemList.add(t);
       }
-      
+
       termMap.put(fieldName, itemList);
     }
   }
@@ -59,10 +68,16 @@ public class TermsResponse {
   public static class Term {
     private String term;
     private long frequency;
+    private long totalTermFreq;
 
     public Term(String term, long frequency) {
+      this(term, frequency, 0);
+    }
+
+    public Term(String term, long frequency, long totalTermFreq) {
       this.term = term;
       this.frequency = frequency;
+      this.totalTermFreq = totalTermFreq;
     }
 
     public String getTerm() {
@@ -80,9 +95,21 @@ public class TermsResponse {
     public void setFrequency(long frequency) {
       this.frequency = frequency;
     }
-    
+
     public void addFrequency(long frequency) {
       this.frequency += frequency;
     }
+
+    public long getTotalTermFreq() {
+      return totalTermFreq;
+    }
+
+    public void setTotalTermFreq(long totalTermFreq) {
+      this.totalTermFreq = totalTermFreq;
+    }
+
+    public void addTotalTermFreq(long totalTermFreq) {
+      this.totalTermFreq += totalTermFreq;
+    }
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/bcc36b90/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java b/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
index 4975846..9f96a80 100644
--- a/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
+++ b/solr/solrj/src/java/org/apache/solr/common/params/TermsParams.java
@@ -42,17 +42,20 @@ public interface TermsParams {
 
   /**
    * Optional. The list of terms to be retrieved.
-   *
    */
   public static final String TERMS_LIST = TERMS_PREFIX + "list";
 
   /**
-   * Optional. The list of terms to be retrieved.
-   *
+   * Optional. If true, also returns index-level statistics, such as numDocs.
    */
   public static final String TERMS_STATS = TERMS_PREFIX + "stats";
 
   /**
+   * Optional. If true, also returns terms' total term frequency.
+   */
+  public static final String TERMS_TTF = TERMS_PREFIX + "ttf";
+
+  /**
    * Optional.  The lower bound term to start at.  The TermEnum will start at the next term after this term in the dictionary.
    *
    * If not specified, the empty string is used
@@ -107,10 +110,11 @@ public interface TermsParams {
       }
   }
 
-    /**
+  /**
    * Optional.  The minimum value of docFreq to be returned.  1 by default
    */
   public static final String TERMS_MINCOUNT = TERMS_PREFIX + "mincount";
+
   /**
    * Optional.  The maximum value of docFreq to be returned.  -1 by default means no boundary
    */