You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ma...@apache.org on 2009/04/10 04:09:47 UTC

svn commit: r763856 - in /lucene/java/trunk/contrib/highlighter/src: java/org/apache/lucene/search/highlight/ test/org/apache/lucene/search/highlight/

Author: markrmiller
Date: Fri Apr 10 02:09:46 2009
New Revision: 763856

URL: http://svn.apache.org/viewvc?rev=763856&view=rev
Log:
Add ConstantScore highlighting support to SpanScorer

Modified:
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html
    lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java?rev=763856&r1=763855&r2=763856&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java Fri Apr 10 02:09:46 2009
@@ -9,6 +9,7 @@
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.ConstantScoreRangeQuery;
 import org.apache.lucene.search.Query;
 
 
@@ -38,7 +39,25 @@
    */
   public SpanScorer(Query query, String field,
     CachingTokenFilter cachingTokenFilter) throws IOException {
-    init(query, field, cachingTokenFilter, null);
+    init(query, field, cachingTokenFilter, null, false);
+  }
+  
+
+  /**
+   * @param query
+   *          Query to use for highlighting
+   * @param field
+   *          Field to highlight - pass null to ignore fields
+   * @param cachingTokenFilter
+   *          token stream of the source text to be highlighted
+   * @param expandMultiTermQuery
+   *          rewrite multi-term queries against a single doc memory index to
+   *          create boolean queries
+   * @throws IOException
+   */
+  public SpanScorer(Query query, String field,
+    CachingTokenFilter cachingTokenFilter, boolean expandMultiTermQuery) throws IOException {
+    init(query, field, cachingTokenFilter, null, expandMultiTermQuery);
   }
 
   /**
@@ -54,7 +73,26 @@
   public SpanScorer(Query query, String field,
     CachingTokenFilter cachingTokenFilter, IndexReader reader)
     throws IOException {
-    init(query, field, cachingTokenFilter, reader);
+    init(query, field, cachingTokenFilter, reader, false);
+  }
+  
+  /**
+   * @param query
+   *            Query to use for highlighting
+   * @param field
+   *            Field to highlight - pass null to ignore fields
+   * @param cachingTokenFilter
+   *            token stream of the source text to be highlighted
+   * @param reader
+   * @param expandMultiTermQuery
+   *          rewrite multi-term queries against a single doc memory index to
+   *          create boolean queries
+   * @throws IOException
+   */
+  public SpanScorer(Query query, String field,
+    CachingTokenFilter cachingTokenFilter, IndexReader reader, boolean expandMultiTermQuery)
+    throws IOException {
+    init(query, field, cachingTokenFilter, reader, expandMultiTermQuery);
   }
 
   /**
@@ -64,7 +102,17 @@
     CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField)
     throws IOException {
     this.defaultField = defaultField.intern();
-    init(query, field, cachingTokenFilter, reader);
+    init(query, field, cachingTokenFilter, reader, false);
+  }
+  
+  /**
+   * As above, but with the option to expand multi-term queries (<tt>expandMultiTermQuery</tt>)
+   */
+  public SpanScorer(Query query, String field,
+    CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField, boolean expandMultiTermQuery)
+    throws IOException {
+    this.defaultField = defaultField.intern();
+    init(query, field, cachingTokenFilter, reader, expandMultiTermQuery);
   }
 
   /**
@@ -73,7 +121,16 @@
   public SpanScorer(Query query, String field,
     CachingTokenFilter cachingTokenFilter, String defaultField) throws IOException {
     this.defaultField = defaultField.intern();
-    init(query, field, cachingTokenFilter, null);
+    init(query, field, cachingTokenFilter, null, false);
+  }
+  
+  /**
+   * @param defaultField - The default field for queries with the field name unspecified
+   */
+  public SpanScorer(Query query, String field,
+    CachingTokenFilter cachingTokenFilter, String defaultField, boolean expandMultiTermQuery) throws IOException {
+    this.defaultField = defaultField.intern();
+    init(query, field, cachingTokenFilter, null, expandMultiTermQuery);
   }
 
   /**
@@ -165,13 +222,13 @@
    * @throws IOException
    */
   private void init(Query query, String field,
-    CachingTokenFilter cachingTokenFilter, IndexReader reader)
+    CachingTokenFilter cachingTokenFilter, IndexReader reader, boolean expandMultiTermQuery)
     throws IOException {
     WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
       : new WeightedSpanTermExtractor(defaultField);
     
     qse.setHighlightCnstScrRngQuery(highlightCnstScrRngQuery);
-
+    qse.setExpandMultiTermQuery(expandMultiTermQuery);
     if (reader == null) {
       this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
           cachingTokenFilter, field);
@@ -183,6 +240,8 @@
 
   /**
    * @return whether ConstantScoreRangeQuerys are set to be highlighted
+   * @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
+   *             constructor option to expand MultiTerm queries.
    */
   public static boolean isHighlightCnstScrRngQuery() {
     return highlightCnstScrRngQuery;
@@ -197,10 +256,13 @@
   }
 
   /**
-   * Turns highlighting of ConstantScoreRangeQuery on/off. ConstantScoreRangeQuerys cannot be
-   * highlighted if you rewrite the query first. Must be called before SpanScorer construction.
+   * Turns highlighting of ConstantScoreRangeQuery on/off.
+   * ConstantScoreRangeQuerys cannot be highlighted if you rewrite the query
+   * first. Must be called before SpanScorer construction.
    * 
    * @param highlightCnstScrRngQuery
+   * @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
+   *             constructor option to expand MultiTerm queries.
    */
   public static void setHighlightCnstScrRngQuery(boolean highlight) {
     highlightCnstScrRngQuery = highlight;

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=763856&r1=763855&r2=763856&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Fri Apr 10 02:09:46 2009
@@ -38,11 +38,16 @@
 import org.apache.lucene.search.ConstantScoreRangeQuery;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RangeQuery;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
@@ -59,6 +64,7 @@
   private Map readers = new HashMap(10); // Map<String, IndexReader>
   private String defaultField;
   private boolean highlightCnstScrRngQuery;
+  private boolean expandMultiTermQuery;
 
   public WeightedSpanTermExtractor() {
   }
@@ -131,6 +137,14 @@
         extract((Query) iterator.next(), disjunctTerms);
       }
       terms.putAll(disjunctTerms);
+    } else if (query instanceof MultiTermQuery && (highlightCnstScrRngQuery || expandMultiTermQuery)) {
+      MultiTermQuery mtq = ((MultiTermQuery)query);
+      if(mtq.getConstantScoreRewrite()) {
+        query = copyMultiTermQuery(mtq);
+        mtq.setConstantScoreRewrite(false);
+      }
+      IndexReader ir = getReaderForField(fieldName);
+      extract(query.rewrite(ir), terms);
     } else if (query instanceof MultiPhraseQuery) {
       final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
       final List termArrays = mpq.getTermArrays();
@@ -179,27 +193,7 @@
         sp.setBoost(query.getBoost());
         extractWeightedSpanTerms(terms, sp);
       }
-    } else if (highlightCnstScrRngQuery && query instanceof ConstantScoreRangeQuery) {
-      ConstantScoreRangeQuery q = (ConstantScoreRangeQuery) query;
-      Term lower = new Term(fieldName, q.getLowerVal());
-      Term upper = new Term(fieldName, q.getUpperVal());
-      FilterIndexReader fir = new FilterIndexReader(getReaderForField(fieldName));
-      try {
-        TermEnum te = fir.terms(lower);
-        BooleanQuery bq = new BooleanQuery();
-        do {
-          Term term = te.term();
-          if (term != null && upper.compareTo(term) >= 0) {
-            bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
-          } else {
-            break;
-          }
-        } while (te.next());
-        extract(bq, terms);
-      } finally {
-        fir.close();
-      }
-    } 
+    }
   }
 
   /**
@@ -425,10 +419,19 @@
     return terms;
   }
 
+  /**
+   * @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use
+   *             getExpandMultiTermQuery instead.
+   */
   public boolean isHighlightCnstScrRngQuery() {
     return highlightCnstScrRngQuery;
   }
-
+  
+  /**
+   * @param highlightCnstScrRngQuery
+   * @deprecated {@link ConstantScoreRangeQuery} is deprecated. Use the
+   *             setExpandMultiTermQuery option.
+   */
   public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) {
     this.highlightCnstScrRngQuery = highlightCnstScrRngQuery;
   }
@@ -460,4 +463,35 @@
     }
     
   }
+  
+  /** Returns a fresh copy of a known MultiTermQuery type, carrying over its boost; other types are returned as-is. */
+  private Query copyMultiTermQuery(MultiTermQuery query) {
+    final Query copy;
+    if (query instanceof RangeQuery) {
+      RangeQuery q = (RangeQuery) query;
+      copy = new RangeQuery(q.getField(), q.getLowerTermText(), q.getUpperTermText(), q.includesLower(), q.includesUpper());
+    } else if (query instanceof WildcardQuery) {
+      copy = new WildcardQuery(query.getTerm());
+    } else if (query instanceof PrefixQuery) {
+      copy = new PrefixQuery(query.getTerm());
+    } else if (query instanceof FuzzyQuery) {
+      FuzzyQuery q = (FuzzyQuery) query;
+      copy = new FuzzyQuery(q.getTerm(), q.getMinSimilarity(), q.getPrefixLength());
+    } else {
+      // Unrecognized subclass: it cannot be cloned here, so hand back the original instance.
+      return query;
+    }
+    // Apply the source boost to the copy for every type; Range/Prefix/Fuzzy copies previously lost it.
+    copy.setBoost(query.getBoost());
+    return copy;
+  }
+  
+  
+  public boolean getExpandMultiTermQuery() {
+    return expandMultiTermQuery;
+  }
+
+  public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
+    this.expandMultiTermQuery = expandMultiTermQuery;
+  }
 }

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html?rev=763856&r1=763855&r2=763856&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html Fri Apr 10 02:09:46 2009
@@ -16,12 +16,19 @@
 matching Spans are recorded with the respective WeightedSpanTerms and these positions are 
 then used to filter possible Token matches during scoring.
 </p>
+<p>
+Unlike with the QueryScorer, you should not rewrite the query before passing it to the SpanScorer
+in order to handle multi-term queries (i.e. wildcard, fuzzy, range).
+The SpanScorer constructors provide an option to enable the highlighting of multi-term queries.
+If this option is enabled, the SpanScorer will rewrite the query against a single doc index
+containing the doc to be highlighted, rather than against the full index. If you do rewrite the 
+query first, certain multi-term queries may not highlight correctly.
+</p>
 <h2>Example Usage</h2>
 
 <pre>
 	IndexSearcher searcher = new IndexSearcher(ramDir);
 	Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
-	query = query.rewrite(reader); //required to expand search terms
 	Hits hits = searcher.search(query);
 
 	for (int i = 0; i &lt; hits.length(); i++)
@@ -29,7 +36,7 @@
 		String text = hits.doc(i).get(FIELD_NAME);
 		CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
                         FIELD_NAME, new StringReader(text)));
-        Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream));
+        Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream, true));
         tokenStream.reset();
         
         // Get 3 best fragments and seperate with a "..."

Modified: lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=763856&r1=763855&r2=763856&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Fri Apr 10 02:09:46 2009
@@ -63,6 +63,7 @@
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
 import org.apache.lucene.search.spans.SpanNearQuery;
@@ -437,7 +438,7 @@
       public void run() throws Exception {
         numHighlights = 0;
         doSearching("Kinnedy~");
-        doStandardHighlights(analyzer, hits, query, HighlighterTest.this);
+        doStandardHighlights(analyzer, hits, query, HighlighterTest.this, true);
         assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
             numHighlights == 5);
       }
@@ -539,6 +540,45 @@
     assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
         numHighlights == 5);
   }
+  
+  public void testConstantScoreMultiTermQuery() throws Exception {
+
+    numHighlights = 0;
+
+    query = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
+    ((WildcardQuery)query).setConstantScoreRewrite(true);
+    searcher = new IndexSearcher(ramDir);
+    // can't rewrite ConstantScore if you want to highlight it -
+    // it rewrites to ConstantScoreQuery which cannot be highlighted
+    // query = unReWrittenQuery.rewrite(reader);
+    System.out.println("Searching for: " + query.toString(FIELD_NAME));
+    hits = searcher.search(query);
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(HighlighterTest.FIELD_NAME);
+      int maxNumFragmentsRequired = 2;
+      String fragmentSeparator = "...";
+      SpanScorer scorer = null;
+      TokenStream tokenStream = null;
+
+      tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME,
+          new StringReader(text)));
+      
+      scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME, (CachingTokenFilter) tokenStream, true);
+
+      Highlighter highlighter = new Highlighter(this, scorer);
+
+      ((CachingTokenFilter) tokenStream).reset();
+
+      highlighter.setTextFragmenter(new SimpleFragmenter(20));
+
+      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+          fragmentSeparator);
+      System.out.println("\t" + result);
+    }
+    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+        numHighlights == 5);
+  }
 
   public void testGetBestFragmentsPhrase() throws Exception {
     TestHighlightRunner helper = new TestHighlightRunner() {
@@ -1565,6 +1605,11 @@
     }
 
     void doStandardHighlights(Analyzer analyzer, Hits hits, Query query, Formatter formatter)
+    throws Exception {
+      doStandardHighlights(analyzer, hits, query, formatter, false);
+    }
+    
+    void doStandardHighlights(Analyzer analyzer, Hits hits, Query query, Formatter formatter, boolean expandMT)
         throws Exception {
 
       for (int i = 0; i < hits.length(); i++) {
@@ -1577,7 +1622,7 @@
           tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME,
               new StringReader(text)));
           scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME,
-              (CachingTokenFilter) tokenStream);
+              (CachingTokenFilter) tokenStream, expandMT);
         } else if (mode == STANDARD) {
           scorer = new QueryScorer(query);
           tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));