You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2022/11/21 09:55:22 UTC

[lucene] branch main updated: Add field as a separate input to newSynonymQuery (#11941)

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 332679886c0 Add field as a separate input to newSynonymQuery (#11941)
332679886c0 is described below

commit 332679886c099d45b69e05450b63853fe0ad071a
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Mon Nov 21 09:55:14 2022 +0000

    Add field as a separate input to newSynonymQuery (#11941)
    
    QueryBuilder#newSynonymQuery takes an array of TermAndBoost objects as a
    parameter and uses the field of the first term in the array as its field. However,
    there are cases where this array may be empty, which will result in an
    ArrayOutOfBoundsException.
    
    This commit reworks QueryBuilder so that TermAndBoost contains plain
    BytesRefs, and passes the field as a separate parameter. This guards against
    accidental calls to newSynonymQuery with an empty list - in this case, an
    empty synonym query is generated rather than an exception. It also
    refactors SynonymQuery itself to hold BytesRefs rather than Terms, which
    needlessly repeat the field for each entry.
    
    Fixes #11864
---
 lucene/CHANGES.txt                                 |  4 ++
 .../org/apache/lucene/search/SynonymQuery.java     | 48 +++++++++++-----------
 .../java/org/apache/lucene/util/QueryBuilder.java  | 38 +++++++++--------
 3 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 9fc38a90e73..5257c959e32 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -111,6 +111,10 @@ API Changes
   memory to provide stronger guarantees on query latency.
   (Adrien Grand, Uwe Schindler)
 
+* GITHUB#11941: QueryBuilder#add and #newSynonymQuery methods now take a `field` parameter,
+  to avoid possible exceptions when building queries from an empty term list.  The helper
+  TermAndBoost class now holds a BytesRef rather than a Term. (Alan Woodward)
+
 New Features
 ---------------------
 * GITHUB#11795: Add ByteWritesTrackingDirectoryWrapper to expose metrics for bytes merged, flushed, and overall
diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
index 0b68dc35681..ca32a7d7bb8 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
@@ -80,6 +80,14 @@ public final class SynonymQuery extends Query {
       if (field.equals(term.field()) == false) {
         throw new IllegalArgumentException("Synonyms must be across the same field");
       }
+      return addTerm(term.bytes(), boost);
+    }
+
+    /**
+     * Adds the provided {@code term} as a synonym, document frequencies of this term will be
+     * boosted by {@code boost}.
+     */
+    public Builder addTerm(BytesRef term, float boost) {
       if (Float.isNaN(boost) || Float.compare(boost, 0f) <= 0 || Float.compare(boost, 1f) > 0) {
         throw new IllegalArgumentException(
             "boost must be a positive float between 0 (exclusive) and 1 (inclusive)");
@@ -110,7 +118,7 @@ public final class SynonymQuery extends Query {
 
   public List<Term> getTerms() {
     return Collections.unmodifiableList(
-        Arrays.stream(terms).map(TermAndBoost::getTerm).collect(Collectors.toList()));
+        Arrays.stream(terms).map(t -> new Term(field, t.term)).collect(Collectors.toList()));
   }
 
   @Override
@@ -120,7 +128,7 @@ public final class SynonymQuery extends Query {
       if (i != 0) {
         builder.append(" ");
       }
-      Query termQuery = new TermQuery(terms[i].term);
+      Query termQuery = new TermQuery(new Term(this.field, terms[i].term));
       builder.append(termQuery.toString(field));
       if (terms[i].boost != 1f) {
         builder.append("^");
@@ -148,7 +156,7 @@ public final class SynonymQuery extends Query {
       return new BooleanQuery.Builder().build();
     }
     if (terms.length == 1 && terms[0].boost == 1f) {
-      return new TermQuery(terms[0].term);
+      return new TermQuery(new Term(field, terms[0].term));
     }
     return this;
   }
@@ -159,7 +167,7 @@ public final class SynonymQuery extends Query {
       return;
     }
     QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.SHOULD, this);
-    Term[] ts = Arrays.stream(terms).map(t -> t.term).toArray(Term[]::new);
+    Term[] ts = Arrays.stream(terms).map(t -> new Term(field, t.term)).toArray(Term[]::new);
     v.consumeTerms(this, ts);
   }
 
@@ -172,7 +180,7 @@ public final class SynonymQuery extends Query {
       // if scores are not needed, let BooleanWeight deal with optimizing that case.
       BooleanQuery.Builder bq = new BooleanQuery.Builder();
       for (TermAndBoost term : terms) {
-        bq.add(new TermQuery(term.term), BooleanClause.Occur.SHOULD);
+        bq.add(new TermQuery(new Term(field, term.term)), BooleanClause.Occur.SHOULD);
       }
       return searcher
           .rewrite(bq.build())
@@ -191,16 +199,17 @@ public final class SynonymQuery extends Query {
       super(query);
       assert scoreMode.needsScores();
       this.scoreMode = scoreMode;
-      CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].term.field());
+      CollectionStatistics collectionStats = searcher.collectionStatistics(field);
       long docFreq = 0;
       long totalTermFreq = 0;
       termStates = new TermStates[terms.length];
       for (int i = 0; i < termStates.length; i++) {
-        TermStates ts = TermStates.build(searcher.getTopReaderContext(), terms[i].term, true);
+        Term term = new Term(field, terms[i].term);
+        TermStates ts = TermStates.build(searcher.getTopReaderContext(), term, true);
         termStates[i] = ts;
         if (ts.docFreq() > 0) {
           TermStatistics termStats =
-              searcher.termStatistics(terms[i].term, ts.docFreq(), ts.totalTermFreq());
+              searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
           docFreq = Math.max(termStats.docFreq(), docFreq);
           totalTermFreq += termStats.totalTermFreq();
         }
@@ -217,13 +226,12 @@ public final class SynonymQuery extends Query {
 
     @Override
     public Matches matches(LeafReaderContext context, int doc) throws IOException {
-      String field = terms[0].term.field();
       Terms indexTerms = context.reader().terms(field);
       if (indexTerms == null) {
         return super.matches(context, doc);
       }
       List<Term> termList =
-          Arrays.stream(terms).map(TermAndBoost::getTerm).collect(Collectors.toList());
+          Arrays.stream(terms).map(t -> new Term(field, t.term)).collect(Collectors.toList());
       return MatchesUtils.forField(
           field,
           () -> DisjunctionMatchesIterator.fromTerms(context, doc, getQuery(), field, termList));
@@ -244,8 +252,7 @@ public final class SynonymQuery extends Query {
             assert scorer instanceof TermScorer;
             freq = ((TermScorer) scorer).freq();
           }
-          LeafSimScorer docScorer =
-              new LeafSimScorer(simWeight, context.reader(), terms[0].term.field(), true);
+          LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
           Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
           Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
           return Explanation.match(
@@ -271,8 +278,8 @@ public final class SynonymQuery extends Query {
       for (int i = 0; i < terms.length; i++) {
         TermState state = termStates[i].get(context);
         if (state != null) {
-          TermsEnum termsEnum = context.reader().terms(terms[i].term.field()).iterator();
-          termsEnum.seekExact(terms[i].term.bytes(), state);
+          TermsEnum termsEnum = context.reader().terms(field).iterator();
+          termsEnum.seekExact(terms[i].term, state);
           if (scoreMode == ScoreMode.TOP_SCORES) {
             ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
             iterators.add(impactsEnum);
@@ -290,8 +297,7 @@ public final class SynonymQuery extends Query {
         return null;
       }
 
-      LeafSimScorer simScorer =
-          new LeafSimScorer(simWeight, context.reader(), terms[0].term.field(), true);
+      LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
 
       // we must optimize this case (term not in segment), disjunctions require >= 2 subs
       if (iterators.size() == 1) {
@@ -443,7 +449,7 @@ public final class SynonymQuery extends Query {
             }
 
             PriorityQueue<SubIterator> pq =
-                new PriorityQueue<SubIterator>(impacts.length) {
+                new PriorityQueue<>(impacts.length) {
                   @Override
                   protected boolean lessThan(SubIterator a, SubIterator b) {
                     if (a.current == null) { // means iteration is finished
@@ -626,18 +632,14 @@ public final class SynonymQuery extends Query {
   }
 
   private static class TermAndBoost {
-    final Term term;
+    final BytesRef term;
     final float boost;
 
-    TermAndBoost(Term term, float boost) {
+    TermAndBoost(BytesRef term, float boost) {
       this.term = term;
       this.boost = boost;
     }
 
-    Term getTerm() {
-      return term;
-    }
-
     @Override
     public boolean equals(Object o) {
       if (this == o) return true;
diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
index 6012c051ceb..3ad17dca5ce 100644
--- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
@@ -65,13 +65,13 @@ public class QueryBuilder {
   /** Wraps a term and boost */
   public static class TermAndBoost {
     /** the term */
-    public final Term term;
+    public final BytesRef term;
     /** the boost */
     public final float boost;
 
     /** Creates a new TermAndBoost */
-    public TermAndBoost(Term term, float boost) {
-      this.term = term;
+    public TermAndBoost(BytesRef term, float boost) {
+      this.term = BytesRef.deepCopyOf(term);
       this.boost = boost;
     }
   }
@@ -390,21 +390,24 @@ public class QueryBuilder {
     stream.reset();
     List<TermAndBoost> terms = new ArrayList<>();
     while (stream.incrementToken()) {
-      terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
+      terms.add(new TermAndBoost(termAtt.getBytesRef(), boostAtt.getBoost()));
     }
 
-    return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
+    return newSynonymQuery(field, terms.toArray(TermAndBoost[]::new));
   }
 
   protected void add(
-      BooleanQuery.Builder q, List<TermAndBoost> current, BooleanClause.Occur operator) {
+      String field,
+      BooleanQuery.Builder q,
+      List<TermAndBoost> current,
+      BooleanClause.Occur operator) {
     if (current.isEmpty()) {
       return;
     }
     if (current.size() == 1) {
-      q.add(newTermQuery(current.get(0).term, current.get(0).boost), operator);
+      q.add(newTermQuery(new Term(field, current.get(0).term), current.get(0).boost), operator);
     } else {
-      q.add(newSynonymQuery(current.toArray(new TermAndBoost[0])), operator);
+      q.add(newSynonymQuery(field, current.toArray(TermAndBoost[]::new)), operator);
     }
   }
 
@@ -421,13 +424,12 @@ public class QueryBuilder {
     stream.reset();
     while (stream.incrementToken()) {
       if (posIncrAtt.getPositionIncrement() != 0) {
-        add(q, currentQuery, operator);
+        add(field, q, currentQuery, operator);
         currentQuery.clear();
       }
-      currentQuery.add(
-          new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
+      currentQuery.add(new TermAndBoost(termAtt.getBytesRef(), boostAtt.getBoost()));
     }
-    add(q, currentQuery, operator);
+    add(field, q, currentQuery, operator);
 
     return q.build();
   }
@@ -518,7 +520,7 @@ public class QueryBuilder {
       if (graph.hasSidePath(start)) {
         final Iterator<TokenStream> sidePathsIterator = graph.getFiniteStrings(start, end);
         Iterator<Query> queries =
-            new Iterator<Query>() {
+            new Iterator<>() {
               @Override
               public boolean hasNext() {
                 return sidePathsIterator.hasNext();
@@ -544,14 +546,14 @@ public class QueryBuilder {
                     s -> {
                       TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class);
                       BoostAttribute b = s.addAttribute(BoostAttribute.class);
-                      return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost());
+                      return new TermAndBoost(t.getBytesRef(), b.getBoost());
                     })
                 .toArray(TermAndBoost[]::new);
         assert terms.length > 0;
         if (terms.length == 1) {
-          positionalQuery = newTermQuery(terms[0].term, terms[0].boost);
+          positionalQuery = newTermQuery(new Term(field, terms[0].term), terms[0].boost);
         } else {
-          positionalQuery = newSynonymQuery(terms);
+          positionalQuery = newSynonymQuery(field, terms);
         }
       }
       if (positionalQuery != null) {
@@ -599,8 +601,8 @@ public class QueryBuilder {
    *
    * @return new Query instance
    */
-  protected Query newSynonymQuery(TermAndBoost[] terms) {
-    SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].term.field());
+  protected Query newSynonymQuery(String field, TermAndBoost[] terms) {
+    SynonymQuery.Builder builder = new SynonymQuery.Builder(field);
     for (TermAndBoost t : terms) {
       builder.addTerm(t.term, t.boost);
     }