You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2015/10/21 11:42:38 UTC
svn commit: r1709780 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/classification/ lucene/classification/src/ lucene/classification/src/java/org/apache/lucene/classification/ lucene/core/ lucene/core/src/java/org/apache/lucene/index/ lucene/co...

Author: tommaso
Date: Wed Oct 21 09:42:38 2015
New Revision: 1709780

URL: http://svn.apache.org/viewvc?rev=1709780&view=rev
Log:
LUCENE-6821 - TermQuery's constructors should clone the incoming term (backport branch 5.x)

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/classification/   (props changed)
    lucene/dev/branches/branch_5x/lucene/classification/src/   (props changed)
    lucene/dev/branches/branch_5x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Term.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/schema/FieldType.java
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/FacetField.java
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java

Modified: lucene/dev/branches/branch_5x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/branch_5x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Wed Oct 21 09:42:38 2015
@@ -166,8 +166,12 @@ public class SimpleNaiveBayesClassifier
     String[] tokenizedDoc = tokenizeDoc(inputDocument);
     int docsWithClassSize = countDocsWithClass();
     while ((next = termsEnum.next()) != null) {
-      double clVal = calculateLogPrior(next, docsWithClassSize) + calculateLogLikelihood(tokenizedDoc, next, docsWithClassSize);
-      dataList.add(new ClassificationResult<>(BytesRef.deepCopyOf(next), clVal));
+      if (next.length > 0) {
+        // We are passing the term to IndexSearcher so we need to make sure it will not change over time
+        Term term = new Term(this.classFieldName, next);
+        double clVal = calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(tokenizedDoc, term, docsWithClassSize);
+        dataList.add(new ClassificationResult<>(term.bytes(), clVal));
+      }
     }
 
     // normalization; the values transforms to a 0-1 range
@@ -240,18 +244,18 @@ public class SimpleNaiveBayesClassifier
     return result.toArray(new String[result.size()]);
   }
 
-  private double calculateLogLikelihood(String[] tokenizedDoc, BytesRef c, int docsWithClassSize) throws IOException {
+  private double calculateLogLikelihood(String[] tokenizedText, Term term, int docsWithClass) throws IOException {
     // for each word
     double result = 0d;
-    for (String word : tokenizedDoc) {
+    for (String word : tokenizedText) {
       // search with text:word AND class:c
-      int hits = getWordFreqForClass(word, c);
+      int hits = getWordFreqForClass(word, term);
 
       // num : count the no of times the word appears in documents of class c (+1)
       double num = hits + 1; // +1 is added because of add 1 smoothing
 
       // den : for the whole dictionary, count the no of times a word appears in documents of class c (+|V|)
-      double den = getTextTermFreqForClass(c) + docsWithClassSize;
+      double den = getTextTermFreqForClass(term) + docsWithClass;
 
       // P(w|c) = num/den
       double wordProbability = num / den;
@@ -262,25 +266,39 @@ public class SimpleNaiveBayesClassifier
     return result;
   }
 
-  private double getTextTermFreqForClass(BytesRef c) throws IOException {
+  /**
+   * Returns the average number of unique terms times the number of docs belonging to the input class
+   * @param term the term representing the class
+   * @return the average number of unique terms per doc times the number of docs with the class
+   * @throws IOException if a low level I/O problem happens
+   */
+  private double getTextTermFreqForClass(Term term) throws IOException {
     double avgNumberOfUniqueTerms = 0;
     for (String textFieldName : textFieldNames) {
       Terms terms = MultiFields.getTerms(leafReader, textFieldName);
       long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
       avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
     }
-    int docsWithC = leafReader.docFreq(new Term(classFieldName, c));
+    int docsWithC = leafReader.docFreq(term);
     return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
   }
 
-  private int getWordFreqForClass(String word, BytesRef c) throws IOException {
+  /**
+   * Returns the number of documents of the input class (from the whole index or from a subset)
+   * that contain the word (in a specific field, or in all the fields if none is selected)
+   * @param word the token produced by the analyzer
+   * @param term the term representing the class
+   * @return the number of documents of the input class
+   * @throws IOException if a low level I/O problem happens
+   */
+  private int getWordFreqForClass(String word, Term term) throws IOException {
     BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
     BooleanQuery.Builder subQuery = new BooleanQuery.Builder();
     for (String textFieldName : textFieldNames) {
       subQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.SHOULD));
     }
     booleanQuery.add(new BooleanClause(subQuery.build(), BooleanClause.Occur.MUST));
-    booleanQuery.add(new BooleanClause(new TermQuery(new Term(classFieldName, c)), BooleanClause.Occur.MUST));
+    booleanQuery.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.MUST));
     if (query != null) {
       booleanQuery.add(query, BooleanClause.Occur.MUST);
     }
@@ -289,11 +307,11 @@ public class SimpleNaiveBayesClassifier
     return totalHitCountCollector.getTotalHits();
   }
 
-  private double calculateLogPrior(BytesRef currentClass, int docsWithClassSize) throws IOException {
-    return Math.log((double) docCount(currentClass)) - Math.log(docsWithClassSize);
+  private double calculateLogPrior(Term term, int docsWithClassSize) throws IOException {
+    return Math.log((double) docCount(term)) - Math.log(docsWithClassSize);
   }
 
-  private int docCount(BytesRef countedClass) throws IOException {
-    return leafReader.docFreq(new Term(classFieldName, countedClass));
+  private int docCount(Term term) throws IOException {
+    return leafReader.docFreq(term);
   }
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Term.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Term.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Term.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/index/Term.java Wed Oct 21 09:42:38 2015
@@ -24,6 +24,7 @@ import java.nio.charset.CodingErrorActio
 import java.nio.charset.StandardCharsets;
 
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
 
 /**
   A Term represents a word from text.  This is the unit of search.  It is
@@ -39,16 +40,22 @@ public final class Term implements Compa
 
   /** Constructs a Term with the given field and bytes.
    * <p>Note that a null field or null bytes value results in undefined
-   * behavior for most Lucene APIs that accept a Term parameter. 
+   * behavior for most Lucene APIs that accept a Term parameter.
    *
-   * <p>WARNING: the provided BytesRef is not copied, but used directly.
-   * Therefore the bytes should not be modified after construction, for
-   * example, you should clone a copy by {@link BytesRef#deepCopyOf}
-   * rather than pass reused bytes from a TermsEnum.
+   * <p>The provided BytesRef is copied when it is non-null.
    */
   public Term(String fld, BytesRef bytes) {
     field = fld;
-    this.bytes = bytes;
+    this.bytes = bytes == null ? null : BytesRef.deepCopyOf(bytes);
+  }
+
+  /** Constructs a Term with the given field and the bytes from a builder.
+   * <p>Note that a null field value results in undefined
+   * behavior for most Lucene APIs that accept a Term parameter.
+   */
+  public Term(String fld, BytesRefBuilder bytesBuilder) {
+    field = fld;
+    this.bytes = bytesBuilder.toBytesRef();
   }
 
   /** Constructs a Term with the given field and text.
@@ -61,7 +68,7 @@ public final class Term implements Compa
   /** Constructs a Term with the given field and empty text.
    * This serves two purposes: 1) reuse of a Term with the same field.
    * 2) pattern for a query.
-   * 
+   *
    * @param fld field's name
    */
   public Term(String fld) {
@@ -75,10 +82,10 @@ public final class Term implements Compa
   /** Returns the text of this term.  In the case of words, this is simply the
     text of the word.  In the case of dates and other types, this is an
     encoding of the object as a string.  */
-  public final String text() { 
+  public final String text() {
     return toString(bytes);
   }
-  
+
   /** Returns human-readable form of the term text. If the term is not unicode,
    * the raw bytes will be printed instead. */
   public static final String toString(BytesRef termText) {
@@ -93,7 +100,7 @@ public final class Term implements Compa
     }
   }
 
-  /** Returns the bytes of this term. */
+  /** Returns the bytes of this term; these should not be modified. */
   public final BytesRef bytes() { return bytes; }
 
   @Override
@@ -141,8 +148,8 @@ public final class Term implements Compa
     }
   }
 
-  /** 
-   * Resets the field and text of a Term. 
+  /**
+   * Resets the field and text of a Term.
    * <p>WARNING: the provided BytesRef is not copied, but used directly.
    * Therefore the bytes should not be modified after construction, for
    * example, you should clone a copy rather than pass reused bytes from

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java Wed Oct 21 09:42:38 2015
@@ -93,7 +93,7 @@ public final class BlendedTermQuery exte
       terms = ArrayUtil.grow(terms, numTerms + 1);
       boosts = ArrayUtil.grow(boosts, numTerms + 1);
       contexts = ArrayUtil.grow(contexts, numTerms + 1);
-      terms[numTerms] = new Term(term.field(), BytesRef.deepCopyOf(term.bytes()));
+      terms[numTerms] = term;
       boosts[numTerms] = boost;
       contexts[numTerms] = context;
       numTerms += 1;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java Wed Oct 21 09:42:38 2015
@@ -103,7 +103,6 @@ public class PhraseQuery extends Query {
      * 
      */
     public Builder add(Term term, int position) {
-      term = new Term(term.field(), BytesRef.deepCopyOf(term.bytes())); // be defensive
       if (position < 0) {
         throw new IllegalArgumentException("Positions must be >= 0, got " + position);
       }
@@ -193,7 +192,7 @@ public class PhraseQuery extends Query {
   private static Term[] toTerms(String field, BytesRef... termBytes) {
     Term[] terms = new Term[termBytes.length];
     for (int i = 0; i < terms.length; ++i) {
-      terms[i] = new Term(field, BytesRef.deepCopyOf(termBytes[i]));
+      terms[i] = new Term(field, termBytes[i]);
     }
     return terms;
   }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java Wed Oct 21 09:42:38 2015
@@ -273,7 +273,7 @@ public class QueryBuilder {
       throw new AssertionError();
     }
     
-    return newTermQuery(new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef())));
+    return newTermQuery(new Term(field, termAtt.getBytesRef()));
   }
   
   /** 
@@ -287,7 +287,7 @@ public class QueryBuilder {
     
     stream.reset();
     while (stream.incrementToken()) {
-      Query currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef())));
+      Query currentQuery = newTermQuery(new Term(field, termAtt.getBytesRef()));
       q.add(currentQuery, BooleanClause.Occur.SHOULD);
     }
     
@@ -322,7 +322,7 @@ public class QueryBuilder {
         add(q, currentQuery.build(), operator);
         currentQuery = newBooleanQuery(true);
       }
-      currentQuery.add(newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD);
+      currentQuery.add(newTermQuery(new Term(field, termAtt.getBytesRef())), BooleanClause.Occur.SHOULD);
     }
     add(q, currentQuery.build(), operator);
     
@@ -380,7 +380,7 @@ public class QueryBuilder {
         multiTerms.clear();
       }
       position += positionIncrement;
-      multiTerms.add(new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef())));
+      multiTerms.add(new Term(field, termAtt.getBytesRef()));
     }
     
     if (enablePositionIncrements) {

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/schema/FieldType.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/schema/FieldType.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/schema/FieldType.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/schema/FieldType.java Wed Oct 21 09:42:38 2015
@@ -745,7 +745,7 @@ public abstract class FieldType extends
       // match-only
       return getRangeQuery(parser, field, externalVal, externalVal, true, true);
     } else {
-      return new TermQuery(new Term(field.getName(), br.toBytesRef()));
+      return new TermQuery(new Term(field.getName(), br));
     }
   }
   

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java Wed Oct 21 09:42:38 2015
@@ -1176,7 +1176,7 @@ public class SolrIndexSearcher extends I
     TermQuery key = null;
 
     if (useCache) {
-      key = new TermQuery(new Term(deState.fieldName, BytesRef.deepCopyOf(deState.termsEnum.term())));
+      key = new TermQuery(new Term(deState.fieldName, deState.termsEnum.term()));
       DocSet result = filterCache.get(key);
       if (result != null) return result;
     }

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/FacetField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/FacetField.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/FacetField.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/FacetField.java Wed Oct 21 09:42:38 2015
@@ -653,7 +653,7 @@ abstract class FacetFieldProcessorFCBase
 
       bucket.add("val", val);
 
-      TermQuery filter = needFilter ? new TermQuery(new Term(sf.getName(), BytesRef.deepCopyOf(br))) : null;
+      TermQuery filter = needFilter ? new TermQuery(new Term(sf.getName(), br)) : null;
       fillBucket(bucket, countAcc.getCount(slotNum), slotNum, null, filter);
 
       bucketList.add(bucket);
@@ -1158,15 +1158,14 @@ class FacetFieldProcessorStream extends
 
         // OK, we have a good bucket to return... first get bucket value before moving to next term
         Object bucketVal = sf.getType().toObject(sf, term);
-        BytesRef termCopy = BytesRef.deepCopyOf(term);
+        TermQuery bucketQuery = hasSubFacets ? new TermQuery(new Term(freq.field, term)) : null;
         term = termsEnum.next();
 
         SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>();
         bucket.add("val", bucketVal);
         addStats(bucket, 0);
         if (hasSubFacets) {
-          TermQuery filter = new TermQuery(new Term(freq.field, termCopy));
-          processSubs(bucket, filter, termSet);
+          processSubs(bucket, bucketQuery, termSet);
         }
 
         // TODO... termSet needs to stick around for streaming sub-facets?

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java Wed Oct 21 09:42:38 2015
@@ -122,10 +122,11 @@ public class UnInvertedField extends Doc
     final BytesRef term = te.term();
 
     if (te.docFreq() > maxTermDocFreq) {
+      Term t = new Term(field, term);  // this makes a deep copy of the term bytes
       TopTerm topTerm = new TopTerm();
-      topTerm.term = BytesRef.deepCopyOf(term);
+      topTerm.term = t.bytes();
       topTerm.termNum = termNum;
-      topTerm.termQuery = new TermQuery(new Term(field, topTerm.term));
+      topTerm.termQuery = new TermQuery(t);
 
       bigTerms.put(topTerm.termNum, topTerm);
 

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java?rev=1709780&r1=1709779&r2=1709780&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/mlt/SimpleMLTQParser.java Wed Oct 21 09:42:38 2015
@@ -133,7 +133,7 @@ public class SimpleMLTQParser extends QP
     BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
     bytesRefBuilder.grow(NumericUtils.BUF_SIZE_INT);
     NumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder);
-    return new Term(field, bytesRefBuilder.toBytesRef());
+    return new Term(field, bytesRefBuilder);
   }