Posted to java-commits@lucene.apache.org by yo...@apache.org on 2009/06/23 01:06:46 UTC

svn commit: r787437 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/document/ src/test/org/apache/lucene/index/

Author: yonik
Date: Mon Jun 22 23:06:46 2009
New Revision: 787437

URL: http://svn.apache.org/viewvc?rev=787437&view=rev
Log:
LUCENE-1699: make Field.tokenStream usable with other stored field mechanisms

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java
    lucene/java/trunk/src/java/org/apache/lucene/document/Field.java
    lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=787437&r1=787436&r2=787437&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Mon Jun 22 23:06:46 2009
@@ -424,6 +424,11 @@
 
 28. LUCENE-1405: Added support for Ant resource collections in contrib/ant
     <index> task.  (Przemyslaw Sztoch via Erik Hatcher)
+
+29. LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing
+    in conjunction with any other way of specifying stored field values,
+    currently binary or string values.  (yonik)
+
     
 Optimizations
 

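For reference, here is a minimal sketch of the combination this change enables, adapted from the testIndexStoreCombos test added below. The class name, field names, and sample text are illustrative only:

    import java.io.StringReader;

    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    public class StoredPlusTokenStreamSketch {
      public static Document makeDoc() {
        // A stored binary value; the indexed tokens come from the TokenStream,
        // not from the bytes.
        byte[] payload = "opaque stored bytes".getBytes();
        Field binaryField = new Field("binary", payload, Field.Store.YES);
        binaryField.setTokenStream(
            new WhitespaceTokenizer(new StringReader("tokens for the binary field")));

        // A stored string value; the TokenStream overrides analysis of the
        // string for indexing, while "stored value" is still what gets stored.
        Field stringField = new Field("string", "stored value",
            Field.Store.YES, Field.Index.ANALYZED);
        stringField.setTokenStream(
            new WhitespaceTokenizer(new StringReader("tokens for the string field")));

        Document doc = new Document();
        doc.add(binaryField);
        doc.add(stringField);
        return doc;
      }
    }
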
Modified: lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java?rev=787437&r1=787436&r2=787437&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/document/AbstractField.java Mon Jun 22 23:06:46 2009
@@ -16,7 +16,8 @@
  */
 
 import org.apache.lucene.search.PhraseQuery; // for javadocs
 import org.apache.lucene.search.spans.SpanQuery; // for javadocs
+import org.apache.lucene.analysis.TokenStream;
 
 
 /**
@@ -38,9 +39,11 @@
   protected boolean lazy = false;
   protected boolean omitTermFreqAndPositions = false;
   protected float boost = 1.0f;
-  // the one and only data object for all different kind of field values
+  // the data object for all different kinds of field values
   protected Object fieldsData = null;
-  //length/offset for all primitive types
+  // pre-analyzed tokenStream for indexed fields
+  protected TokenStream tokenStream;
+  // length/offset for all primitive types
   protected int binaryLength;
   protected int binaryOffset;
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/document/Field.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/document/Field.java?rev=787437&r1=787436&r2=787437&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/document/Field.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/document/Field.java Mon Jun 22 23:06:46 2009
@@ -94,7 +94,7 @@
     /** Expert: Index the field's value without an Analyzer,
      * and also disable the storing of norms.  Note that you
      * can also separately enable/disable norms by calling
-     * {@link #setOmitNorms}.  No norms means that
+     * {@link Field#setOmitNorms}.  No norms means that
      * index-time field and document boosting and field
      * length normalization are disabled.  The benefit is
      * less memory usage as norms take up one byte of RAM
@@ -159,19 +159,19 @@
   }
   
   
-  /** The value of the field as a String, or null.  If null, the Reader value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a String, or null.  If null, the Reader value or
+   * binary value is used.  Exactly one of stringValue(),
+   * readerValue(), and getBinaryValue() must be set. */
   public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }
   
-  /** The value of the field as a Reader, or null.  If null, the String value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a Reader, or null.  If null, the String value or
+   * binary value is used.  Exactly one of stringValue(),
+   * readerValue(), and getBinaryValue() must be set. */
   public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
   
   /** The value of the field in Binary, or null.  If null, the Reader value,
-   * String value, or TokenStream value is used. Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
+   * or String value is used. Exactly one of stringValue(),
+   * readerValue(), and getBinaryValue() must be set.
    * @deprecated This method must allocate a new byte[] if
    * the {@link AbstractField#getBinaryOffset()} is non-zero
    * or {@link AbstractField#getBinaryLength()} is not the
@@ -191,10 +191,9 @@
     return ret;    
   }
   
-  /** The value of the field as a TokesStream, or null.  If null, the Reader value,
-   * String value, or binary value is used. Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
-  public TokenStream tokenStreamValue()   { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+  /** The TokenStream for this field to be used when indexing, or null.  If null, the Reader value
+   * or String value is analyzed to produce the indexed tokens. */
+  public TokenStream tokenStreamValue()   { return tokenStream; }
   
 
   /** <p>Expert: change the value of this field.  This can
@@ -204,10 +203,7 @@
    *  a single {@link Document} instance is re-used as
    *  well.  This helps most on small documents.</p>
    * 
-   *  <p>Note that you should only use this method after the
-   *  Field has been consumed (ie, the {@link Document}
-   *  containing this Field has been added to the index).
-   *  Also, each Field instance should only be used once
+   *  <p>Each Field instance should only be used once
    *  within a single {@link Document} instance.  See <a
    *  href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
    *  for details.</p> */
@@ -250,7 +246,8 @@
   }
   
   
-  /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
+  /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>.
+   * @deprecated use {@link #setTokenStream} */
   public void setValue(TokenStream value) {
     if (isBinary) {
       throw new IllegalArgumentException("cannot set a TokenStream value on a binary field");
@@ -258,7 +255,16 @@
     if (isStored) {
       throw new IllegalArgumentException("cannot set a TokenStream value on a stored field");
     }
-    fieldsData = value;
+    fieldsData = null;
+    tokenStream = value;
+  }
+
+  /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
+   *  May be combined with stored values from stringValue() or binaryValue(). */
+  public void setTokenStream(TokenStream tokenStream) {
+    this.isIndexed = true;
+    this.isTokenized = true;
+    this.tokenStream = tokenStream;
   }
 
   /**
@@ -459,8 +465,9 @@
       throw new NullPointerException("tokenStream cannot be null");
     
     this.name = name.intern();        // field names are interned
-    this.fieldsData = tokenStream;
-    
+    this.fieldsData = null;
+    this.tokenStream = tokenStream;
+
     this.isStored = false;
     this.isCompressed = false;
     

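To illustrate the deprecation above: setValue(TokenStream) replaced the field's value outright, while the new setTokenStream attaches the stream for indexing only and leaves any stored string or binary value intact. A minimal sketch (the class name, method name, and sample text are made up):

    import java.io.StringReader;

    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.document.Field;

    public class SetTokenStreamMigrationSketch {
      /** Re-points an already-constructed field at freshly analyzed tokens. */
      public static void retarget(Field f, String newText) {
        // Before this commit:
        //   f.setValue(new WhitespaceTokenizer(new StringReader(newText)));  // now deprecated
        // After this commit, the stream is used for indexing only, so a stored
        // stringValue() or binaryValue() on the same field is preserved:
        f.setTokenStream(new WhitespaceTokenizer(new StringReader(newText)));
      }
    }
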
Modified: lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java?rev=787437&r1=787436&r2=787437&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/document/Fieldable.java Mon Jun 22 23:06:46 2009
@@ -74,36 +74,41 @@
    */
   String name();
 
-  /** The value of the field as a String, or null.  If null, the Reader value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a String, or null.
+   * <p>
+   * For indexing, if isStored()==true, stringValue() will be used as the stored field value
+   * unless isBinary()==true, in which case binaryValue() will be used.
+   *
+   * If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token.
+   * If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null;
+   * otherwise readerValue() will be used if not null; otherwise stringValue() will be analyzed to generate tokens.
+   */
   public String stringValue();
   
-  /** The value of the field as a Reader, or null.  If null, the String value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a Reader, which can be used at index time to generate indexed tokens.
+   * @see #stringValue()
+   */
   public Reader readerValue();
   
-  /** The value of the field in Binary, or null.  If null, the Reader value,
-   * String value, or TokenStream value is used. Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field in binary, or null.
+   * @see #stringValue()
+   */
   public byte[] binaryValue();
   
-  /** The value of the field as a TokenStream, or null.  If null, the Reader value,
-   * String value, or binary value is used. Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The TokenStream for this field to be used when indexing, or null.
+   * @see #stringValue()
+   */
   public TokenStream tokenStreamValue();
 
-  /** True iff the value of the field is to be stored in the index for return
-    with search hits.  It is an error for this to be true if a field is
-    Reader-valued. */
+  /** True if the value of the field is to be stored in the index for return
+    with search hits. */
   boolean  isStored();
 
-  /** True iff the value of the field is to be indexed, so that it may be
+  /** True if the value of the field is to be indexed, so that it may be
     searched on. */
   boolean  isIndexed();
 
-  /** True iff the value of the field should be tokenized as text prior to
+  /** True if the value of the field should be tokenized as text prior to
     indexing.  Un-tokenized fields are indexed as a single word and may not be
     Reader-valued. */
   boolean  isTokenized();
@@ -111,7 +116,7 @@
   /** True if the value of the field is stored and compressed within the index */
   boolean  isCompressed();
 
-  /** True iff the term or terms used to index this field are stored as a term
+  /** True if the term or terms used to index this field are stored as a term
    *  vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
    *  These methods do not provide access to the original content of the field,
    *  only to terms used to index it. If the original content must be
@@ -122,17 +127,17 @@
   boolean isTermVectorStored();
 
   /**
-   * True iff terms are stored as term vector together with their offsets 
+   * True if terms are stored as term vector together with their offsets 
   * (start and end position in source text).
    */
   boolean isStoreOffsetWithTermVector();
 
   /**
-   * True iff terms are stored as term vector together with their token positions.
+   * True if terms are stored as term vector together with their token positions.
    */
   boolean isStorePositionWithTermVector();
 
-  /** True iff the value of the filed is stored as binary */
+  /** True if the value of the field is stored as binary */
   boolean  isBinary();
 
   /** True if norms are omitted for this indexed field */

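The indexing-time precedence spelled out in the new stringValue() javadoc can be summarized in code. This is a sketch of the documented contract for an indexed, tokenized field, not the actual indexer implementation (class and method names are made up):

    import java.io.Reader;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Fieldable;

    public class TokenSourceSketch {
      /** The token source for a field with isIndexed()==true and isTokenized()==true. */
      public static TokenStream indexedTokens(Fieldable f, Analyzer analyzer) {
        if (f.tokenStreamValue() != null) {
          return f.tokenStreamValue();                    // 1. pre-analyzed stream wins
        }
        Reader reader = f.readerValue();
        if (reader != null) {
          return analyzer.tokenStream(f.name(), reader);  // 2. else analyze the Reader
        }
        // 3. else analyze the String value
        return analyzer.tokenStream(f.name(), new StringReader(f.stringValue()));
      }
    }
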
Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=787437&r1=787436&r2=787437&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Mon Jun 22 23:06:46 2009
@@ -17,11 +17,7 @@
  * limitations under the License.
  */
 
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.Reader;
+import java.io.*;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -4350,4 +4346,66 @@
     t.join();
     assertFalse(t.failed);
   }
+
+
+  public void testIndexStoreCombos() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+    byte[] b = new byte[50];
+    for(int i=0;i<50;i++)
+      b[i] = (byte) (i+77);
+
+    Document doc = new Document();
+    Field f = new Field("binary", b, 10, 17, Field.Store.YES);
+    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1")));
+    Field f2 = new Field("string", "value", Field.Store.YES,Field.Index.ANALYZED);
+    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2")));
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    
+    // add 2 docs to test in-memory merging
+    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1")));
+    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2")));
+    w.addDocument(doc);
+  
+    // force segment flush so we can force a segment merge with doc3 later.
+    w.commit();
+
+    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1")));
+    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2")));
+
+    w.addDocument(doc);
+    w.commit();
+    w.optimize();   // force segment merge.
+
+    IndexReader ir = IndexReader.open(dir);
+    doc = ir.document(0);
+    f = doc.getField("binary");
+    b = f.getBinaryValue();
+    assertTrue(b != null);
+    assertEquals(17, b.length);
+    assertEquals(87, b[0]);
+
+    assertTrue(ir.document(0).getFieldable("binary").isBinary());
+    assertTrue(ir.document(1).getFieldable("binary").isBinary());
+    assertTrue(ir.document(2).getFieldable("binary").isBinary());
+    
+    assertEquals("value", ir.document(0).get("string"));
+    assertEquals("value", ir.document(1).get("string"));
+    assertEquals("value", ir.document(2).get("string"));
+
+
+    // test that the terms were indexed.
+    assertTrue(ir.termDocs(new Term("binary","doc1field1")).next());
+    assertTrue(ir.termDocs(new Term("binary","doc2field1")).next());
+    assertTrue(ir.termDocs(new Term("binary","doc3field1")).next());
+    assertTrue(ir.termDocs(new Term("string","doc1field2")).next());
+    assertTrue(ir.termDocs(new Term("string","doc2field2")).next());
+    assertTrue(ir.termDocs(new Term("string","doc3field2")).next());
+
+    ir.close();
+    dir.close();
+
+  }
 }