You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by gs...@apache.org on 2008/01/24 15:36:49 UTC
svn commit: r614884 - /lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java

Author: gsingers
Date: Thu Jan 24 06:36:46 2008
New Revision: 614884

URL: http://svn.apache.org/viewvc?rev=614884&view=rev
Log:
LUCENE-1127: added couple of convenience methods to TokenSources

Modified:
    lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java

Modified: lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=614884&r1=614883&r2=614884&view=diff
==============================================================================
--- lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/java/trunk/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Thu Jan 24 06:36:46 2008
@@ -43,6 +43,36 @@
  */
 public class TokenSources
 {
+  /**
+   * A convenience method that first tries to get a TermPositionVector for the specified docId, then falls back to
+   * using the passed in {@link org.apache.lucene.document.Document} to retrieve the TokenStream.  This is useful when
+   * you already have the document, but would prefer to use the vector first.
+   * @param reader The {@link org.apache.lucene.index.IndexReader} from which to try to get the vector
+   * @param docId The docId to retrieve.
+   * @param field The field to retrieve on the document
+   * @param doc The document to fall back on
+   * @param analyzer The analyzer to use for creating the TokenStream if the vector doesn't exist
+   * @return The {@link org.apache.lucene.analysis.TokenStream} for the {@link org.apache.lucene.document.Fieldable} on the {@link org.apache.lucene.document.Document}
+   * @throws IOException if there was an error loading
+   */
+  public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Document doc, Analyzer analyzer) throws IOException{
+    TokenStream ts=null;
+
+		TermFreqVector tfv=(TermFreqVector) reader.getTermFreqVector(docId,field);
+		if(tfv!=null)
+		{
+		    if(tfv instanceof TermPositionVector)
+		    {
+		        ts=getTokenStream((TermPositionVector) tfv);
+		    }
+		}
+		//No token info stored so fall back to analyzing raw content
+		if(ts==null)
+		{
+		    ts=getTokenStream(doc,field,analyzer);
+		}
+		return ts;
+  }
     /**
      * A convenience method that tries a number of approaches to getting a token stream.
      * The cost of finding there are no termVectors in the index is minimal (1000 invocations still 
@@ -219,15 +249,21 @@
     //convenience method
     public static TokenStream getTokenStream(IndexReader reader,int docId, String field,Analyzer analyzer) throws IOException
     {
-		Document doc=reader.document(docId);
-		String contents=doc.get(field);
+		  Document doc=reader.document(docId);
+		  return getTokenStream(doc, field, analyzer);
+    }
+    
+  public static TokenStream getTokenStream(Document doc, String field, Analyzer analyzer){
+    String contents=doc.get(field);
 		if(contents==null)
 		{
-		    throw new IllegalArgumentException("Field "+field +" in document #"+docId+ " is not stored and cannot be analyzed");
+		    throw new IllegalArgumentException("Field "+field +" in document is not stored and cannot be analyzed");
 		}
-        return analyzer.tokenStream(field,new StringReader(contents));
-    }
-    
-    
+        return getTokenStream(field, contents, analyzer);
+  }
+  //convenience method
+  public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer){
+    return analyzer.tokenStream(field,new StringReader(contents));
+  }
 
 }