Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2006/07/15 17:12:45 UTC

svn commit: r422248 - /incubator/solr/trunk/src/java/org/apache/solr/util/SolrPluginUtils.java

Author: yonik
Date: Sat Jul 15 08:12:45 2006
New Revision: 422248

URL: http://svn.apache.org/viewvc?rev=422248&view=rev
Log:
order tokens by startOffset when highlighting
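Analysis chains can emit a catenated token after its sub-tokens (for
example "wi", "fi", then "wifi" from WordDelimiterFilter), leaving start
offsets out of order; the Lucene highlighter expects ascending start
offsets, so re-sort tokens within a small sliding window before handing
them to it.  (A standalone sketch of the behavior follows the diff.)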

Modified:
    incubator/solr/trunk/src/java/org/apache/solr/util/SolrPluginUtils.java

Modified: incubator/solr/trunk/src/java/org/apache/solr/util/SolrPluginUtils.java
URL: http://svn.apache.org/viewvc/incubator/solr/trunk/src/java/org/apache/solr/util/SolrPluginUtils.java?rev=422248&r1=422247&r2=422248&view=diff
==============================================================================
--- incubator/solr/trunk/src/java/org/apache/solr/util/SolrPluginUtils.java (original)
+++ incubator/solr/trunk/src/java/org/apache/solr/util/SolrPluginUtils.java Sat Jul 15 08:12:45 2006
@@ -78,14 +78,7 @@
 import java.util.logging.Level;
 import java.util.logging.Handler;
 
-import java.util.Arrays;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Collection;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.HashMap;
+import java.util.*;
 import java.util.regex.Pattern;
 import java.io.IOException;
 import java.io.StringReader;
@@ -393,8 +386,10 @@
               searcher.getReader(), docId, fieldName);
           } catch (IllegalArgumentException e) {
             // fall back to analyzer
-            tstream = searcher.getSchema().getAnalyzer().tokenStream(
-             fieldName, new StringReader(docTexts[0]));
+            tstream = new TokenOrderingFilter(
+                    searcher.getSchema().getAnalyzer().tokenStream(
+                      fieldName, new StringReader(docTexts[0])),
+                    10);
           }
           frag = highlighter.getBestTextFragments(
             tstream, docTexts[0], false, numFragments);
@@ -404,7 +399,7 @@
           MultiValueTokenStream tstream;
           tstream = new MultiValueTokenStream(fieldName,
                                               docTexts,
-                                              searcher.getSchema().getAnalyzer());
+                                              searcher.getSchema().getAnalyzer(), true);
           frag = highlighter.getBestTextFragments(
             tstream, tstream.asSingleValue(), false, numFragments);
         }
@@ -877,6 +872,7 @@
   private int curIndex;                  // next index into the values array
   private int curOffset;                 // offset into concatenated string
   private TokenStream currentStream;     // tokenStream currently being iterated
+  private boolean orderTokenOffsets;
 
   /** Constructs a TokenStream for consecutively-analyzed field values
    *
@@ -885,7 +881,8 @@
    * @param analyzer analyzer instance
    */
   public MultiValueTokenStream(String fieldName, String[] values, 
-                               Analyzer analyzer) {
+                               Analyzer analyzer, boolean orderTokenOffsets) {
     this.fieldName = fieldName;
     this.values = values;
     this.analyzer = analyzer;
+    this.orderTokenOffsets = orderTokenOffsets;
@@ -903,6 +900,7 @@
       if(curIndex < values.length) {
         currentStream = analyzer.tokenStream(fieldName, 
                                              new StringReader(values[curIndex]));
+        if (orderTokenOffsets) currentStream = new TokenOrderingFilter(currentStream,10);
         // add extra space between multiple values
         if(curIndex > 0) 
           extra = analyzer.getPositionIncrementGap(fieldName);
@@ -964,5 +962,48 @@
         fragOffsetAccum += token.endOffset() - fragOffsetAccum;
     }
     return isNewFrag;
+  }
+}
+
+
+/** Orders Tokens within a sliding window by ascending startOffset;
+ * endOffset is currently ignored.
+ * This is meant only to work around fickleness in the highlighter.  It
+ * can mess up token positions and should not be used for indexing or querying.
+ */
+class TokenOrderingFilter extends TokenFilter {
+  private final int windowSize;
+  private final LinkedList<Token> queue = new LinkedList<Token>();
+  private boolean done=false;
+
+  protected TokenOrderingFilter(TokenStream input, int windowSize) {
+    super(input);
+    this.windowSize = windowSize;
+  }
+
+  public Token next() throws IOException {
+    while (!done && queue.size() < windowSize) {
+      Token newTok = input.next();
+      if (newTok==null) {
+        done=true;
+        break;
+      }
+
+      // Iterate in reverse for efficiency: the queue is already sorted,
+      // and most incoming tokens will already belong near the end.
+      ListIterator<Token> iter = queue.listIterator(queue.size());
+      while(iter.hasPrevious()) {
+        if (newTok.startOffset() >= iter.previous().startOffset()) {
+          // ListIterator.add() inserts before the element next() would
+          // return (the one we just compared against), so step forward
+          // once so the insertion lands after it.
+          iter.next();
+          break;
+        }
+      }
+      iter.add(newTok);
+    }
+
+    return queue.isEmpty() ? null : queue.removeFirst();
   }
 }
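
Below is a minimal standalone sketch (not part of the commit) showing the
effect of TokenOrderingFilter, assuming the Lucene 2.x token API of the
period (Token(text, start, end), termText(), and a next() that returns
null at end of stream).  TokenOrderingDemo is hypothetical; since the
filter's constructor is protected and the class is package-private, the
sketch assumes it sits in org.apache.solr.util:

package org.apache.solr.util;

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/** Hypothetical demo, not part of r422248: feeds out-of-order tokens
 * through TokenOrderingFilter and prints the re-sorted result.
 */
class TokenOrderingDemo {
  public static void main(String[] args) throws IOException {
    // A catenated token emitted after its parts leaves start offsets
    // out of order: 0, 3, 0 for "wi", "fi", "wifi".
    final Iterator<Token> toks = Arrays.asList(
        new Token("wi",   0, 2),
        new Token("fi",   3, 5),
        new Token("wifi", 0, 5)).iterator();

    TokenStream unordered = new TokenStream() {
      public Token next() { return toks.hasNext() ? toks.next() : null; }
    };

    // Window of 10, matching the value hard-coded in the commit.
    TokenStream ordered = new TokenOrderingFilter(unordered, 10);
    for (Token tok = ordered.next(); tok != null; tok = ordered.next()) {
      System.out.println(tok.termText() + " @" + tok.startOffset());
    }
    // Prints: wi @0, wifi @0, fi @3 -- ascending start offsets.
  }
}

Note that the window bounds how far a token can be re-ordered: a small
start offset arriving more than a window's worth of tokens late will
still reach the highlighter out of order.  This keeps memory bounded at
the cost of not handling pathological analyzers.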