Posted to java-commits@lucene.apache.org by mh...@apache.org on 2009/03/26 00:05:17 UTC

svn commit: r758460 - in /lucene/java/trunk: ./ contrib/highlighter/src/java/org/apache/lucene/search/highlight/ contrib/highlighter/src/test/org/apache/lucene/search/highlight/

Author: mharwood
Date: Wed Mar 25 23:05:14 2009
New Revision: 758460

URL: http://svn.apache.org/viewvc?rev=758460&view=rev
Log:
Fix for LUCENE-1500 - new exception added to Highlighter API to handle TokenStreams containing Tokens whose offsets exceed the given text's length

Added:
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
    lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
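
For code that calls the Highlighter, the practical effect of this commit is that InvalidTokenOffsetsException becomes a checked exception on getBestFragment/getBestFragments/getBestTextFragments. A minimal sketch of how a caller might adapt is below; the field name, query and fallback behaviour are placeholder assumptions, not part of this commit.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;

    public class HighlightCallerSketch {
        public static String highlight(String text) throws IOException {
            Analyzer analyzer = new StandardAnalyzer();
            Query query = new TermQuery(new Term("field", "fox"));
            Highlighter highlighter = new Highlighter(new QueryScorer(query));
            TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
            try {
                return highlighter.getBestFragment(tokenStream, text);
            } catch (InvalidTokenOffsetsException e) {
                // New checked exception from this commit: a token's offsets point
                // past the end of "text". Fall back to the unhighlighted text here
                // rather than failing the whole request.
                return text;
            }
        }
    }

Catching and falling back is only one option; callers may equally declare the exception and let it propagate.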

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=758460&r1=758459&r2=758460&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Mar 25 23:05:14 2009
@@ -65,6 +65,10 @@
 
 11. LUCENE-1561: Renamed Field.omitTf to Field.omitTermFreqAndPositions
    (Otis Gospodnetic via Mike McCandless)
+  
+12. LUCENE-1500: Added new InvalidTokenOffsetsException to Highlighter methods
+    to denote issues when offsets in TokenStream tokens exceed the length of the
+    provided text.  (Mark Harwood)
 
 Bug fixes
 

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=758460&r1=758459&r2=758460&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java Wed Mar 25 23:05:14 2009
@@ -74,9 +74,10 @@
 	 * @param fieldName Name of field used to influence analyzer's tokenization policy
 	 *
 	 * @return highlighted text fragment or null if no terms found
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
 		return getBestFragment(tokenStream, text);
@@ -96,9 +97,10 @@
 	 * @param text text to highlight terms in
 	 *
 	 * @return highlighted text fragment or null if no terms found
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String getBestFragment(TokenStream tokenStream, String text)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		String[] results = getBestFragments(tokenStream,text, 1);
 		if (results.length > 0)
@@ -120,12 +122,13 @@
 	 * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
 	 * method of the same name that takes a fieldname.
 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 * @throws InvalidTokenOffsetsException  thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String[] getBestFragments(
 		Analyzer analyzer,
 		String text,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
 		return getBestFragments(tokenStream, text, maxNumFragments);
@@ -142,13 +145,14 @@
 	 * @param maxNumFragments  the maximum number of fragments.
 	 *
 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String[] getBestFragments(
 		Analyzer analyzer,
 		String fieldName,
 		String text,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
 		return getBestFragments(tokenStream, text, maxNumFragments);
@@ -165,12 +169,13 @@
 	 * @param maxNumFragments  the maximum number of fragments.
 	 *
 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String[] getBestFragments(
 		TokenStream tokenStream,
 		String text,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		maxNumFragments = Math.max(1, maxNumFragments); //sanity check
 
@@ -198,13 +203,14 @@
 	 * @param maxNumFragments
 	 * @param mergeContiguousFragments
 	 * @throws IOException
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final TextFragment[] getBestTextFragments(
 		TokenStream tokenStream,
 		String text,
 		boolean mergeContiguousFragments,
 		int maxNumFragments)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		ArrayList docFrags = new ArrayList();
 		StringBuffer newText=new StringBuffer();
@@ -230,6 +236,14 @@
 			     (nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
 			     nextToken = tokenStream.next(reusableToken))
 			{
+				if(	(nextToken.endOffset()>text.length())
+					||
+					(nextToken.startOffset()>text.length())
+					)						
+				{
+					throw new InvalidTokenOffsetsException("Token "+nextToken.toString()
+							+" exceeds length of provided text sized "+text.length());
+				}
 				if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
 				{
 					//the current token is distinct from previous tokens -
@@ -452,13 +466,14 @@
 	 * @param separator  the separator used to intersperse the document fragments (typically "...")
 	 *
 	 * @return highlighted text
+	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 	 */
 	public final String getBestFragments(
 		TokenStream tokenStream,	
 		String text,
 		int maxNumFragments,
 		String separator)
-		throws IOException
+		throws IOException, InvalidTokenOffsetsException
 	{
 		String sections[] =	getBestFragments(tokenStream,text, maxNumFragments);
 		StringBuffer result = new StringBuffer();
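
The hunk above in getBestTextFragments() is where the new exception is actually raised: each token's startOffset and endOffset are compared against the length of the text being highlighted before the token is grouped. Note that the comparison is strict (">"), so an endOffset equal to text.length() is still accepted; only offsets pointing past the end of the string are rejected. As a rough standalone illustration of the guarded condition (the helper name below is hypothetical, not part of the commit):

    import org.apache.lucene.analysis.Token;

    final class TokenOffsetCheckSketch {
        // Mirrors the test added in getBestTextFragments(): a token is rejected
        // if either of its offsets points past the end of the highlighted text.
        static boolean exceedsText(Token token, String text) {
            return token.startOffset() > text.length()
                || token.endOffset() > text.length();
        }
    }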

Added: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java?rev=758460&view=auto
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java (added)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java Wed Mar 25 23:05:14 2009
@@ -0,0 +1,31 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Exception thrown if TokenStream Tokens are incompatible with provided text
+ *
+ */
+public class InvalidTokenOffsetsException extends Exception
+{
+
+	public InvalidTokenOffsetsException(String message)
+	{
+		super(message);
+	}
+
+}

Modified: lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=758460&r1=758459&r2=758460&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Wed Mar 25 23:05:14 2009
@@ -136,9 +136,10 @@
 
   /**
    * This method intended for use with <tt>testHighlightingWithDefaultField()</tt>
+ * @throws InvalidTokenOffsetsException 
    */
   private static String highlightField(Query query, String fieldName, String text)
-      throws IOException {
+      throws IOException, InvalidTokenOffsetsException {
     CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream(
         fieldName, new StringReader(text)));
     // Assuming "<B>", "</B>" used to highlight
@@ -1291,7 +1292,7 @@
   private Directory dir = new RAMDirectory();
   private Analyzer a = new WhitespaceAnalyzer();
   
-  public void testWeightedTermsWithDeletes() throws IOException, ParseException {
+  public void testWeightedTermsWithDeletes() throws IOException, ParseException, InvalidTokenOffsetsException {
     makeIndex();
     deleteDocument();
     searchIndex();
@@ -1321,7 +1322,7 @@
     writer.close();
   }
   
-  private void searchIndex() throws IOException, ParseException {
+  private void searchIndex() throws IOException, ParseException, InvalidTokenOffsetsException {
     String q = "t_text1:random";
     QueryParser parser = new QueryParser( "t_text1", a );
     Query query = parser.parse( q );
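
The test changes shown here only widen throws clauses. As a supplementary sketch (not part of this commit; class and method names are hypothetical), a test that actually provokes the new exception could feed the highlighter a text shorter than the string the tokens were produced from:

    import java.io.StringReader;

    import junit.framework.TestCase;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;

    public class InvalidTokenOffsetsSketchTest extends TestCase {

        public void testTokenOffsetsPastEndOfText() throws Exception {
            String fullText = "the quick brown fox jumps over the lazy dog";
            // Highlight only a truncated copy, so offsets computed from the full
            // text run past the end of the string handed to the highlighter.
            String truncated = fullText.substring(0, 10);

            TokenStream tokenStream = new WhitespaceAnalyzer()
                .tokenStream("field", new StringReader(fullText));
            Highlighter highlighter = new Highlighter(
                new QueryScorer(new TermQuery(new Term("field", "fox"))));

            try {
                highlighter.getBestFragment(tokenStream, truncated);
                fail("expected InvalidTokenOffsetsException");
            } catch (InvalidTokenOffsetsException e) {
                // expected: "brown" ends at offset 15, past truncated.length() == 10
            }
        }
    }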