You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ma...@apache.org on 2008/11/13 01:01:44 UTC

svn commit: r713569 - in /lucene/java/trunk/contrib/highlighter/src: java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java test/org/apache/lucene/search/highlight/HighlighterTest.java

Author: markrmiller
Date: Wed Nov 12 16:01:43 2008
New Revision: 713569

URL: http://svn.apache.org/viewvc?rev=713569&view=rev
Log:
LUCENE-1389: SimpleSpanFragmenter can create very short fragments

Modified:
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
    lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java?rev=713569&r1=713568&r2=713569&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java Wed Nov 12 16:01:43 2008
@@ -33,6 +33,7 @@
   private int position = -1;
   private SpanScorer spanScorer;
   private int waitForPos = -1;
+  private int textSize;
 
   /**
    * @param spanscorer SpanScorer that was used to score hits
@@ -70,14 +71,14 @@
       for (int i = 0; i < positionSpans.size(); i++) {
         if (((PositionSpan) positionSpans.get(i)).start == position) {
           waitForPos = ((PositionSpan) positionSpans.get(i)).end + 1;
-
-          return true;
+          break;
         }
       }
     }
 
-    boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags);
-
+    boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags)
+        && (textSize - token.endOffset()) >= (fragmentSize >>> 1);
+    
     if (isNewFrag) {
       currentNumFrags++;
     }
@@ -89,7 +90,8 @@
    * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String)
    */
   public void start(String originalText) {
-    position = 0;
+    position = -1;
     currentNumFrags = 1;
+    textSize = originalText.length();
   }
 }

Modified: lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=713569&r1=713568&r2=713569&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Wed Nov 12 16:01:43 2008
@@ -242,6 +242,46 @@
     }
   }
   
+  public void testSimpleSpanFragmenter() throws Exception {
+    doSearching("\"piece of text that is very long\"");
+
+    int maxNumFragmentsRequired = 2;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer
+          .tokenStream(FIELD_NAME, new StringReader(text)));
+      SpanScorer spanscorer = new SpanScorer(query, FIELD_NAME, tokenStream);
+      Highlighter highlighter = new Highlighter(this, spanscorer);
+      highlighter.setTextFragmenter(new SimpleSpanFragmenter(spanscorer, 5));
+      tokenStream.reset();
+
+      String result = highlighter.getBestFragments(tokenStream, text,
+          maxNumFragmentsRequired, "...");
+      System.out.println("\t" + result);
+
+    }
+    
+    doSearching("\"been shot\"");
+
+    maxNumFragmentsRequired = 2;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer
+          .tokenStream(FIELD_NAME, new StringReader(text)));
+      SpanScorer spanscorer = new SpanScorer(query, FIELD_NAME, tokenStream);
+      Highlighter highlighter = new Highlighter(this, spanscorer);
+      highlighter.setTextFragmenter(new SimpleSpanFragmenter(spanscorer, 20));
+      tokenStream.reset();
+
+      String result = highlighter.getBestFragments(tokenStream, text,
+          maxNumFragmentsRequired, "...");
+      System.out.println("\t" + result);
+
+    }
+  }
+  
   // position sensitive query added after position insensitive query
   public void testPosTermStdTerm() throws Exception {
     doSearching("y \"x y z\"");