You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by cu...@apache.org on 2002/08/05 19:15:01 UTC
cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestPositionIncrement.java

cutting     2002/08/05 10:15:00

  Modified:    src/java/org/apache/lucene/analysis Token.java
               src/java/org/apache/lucene/index DocumentWriter.java
  Added:       src/test/org/apache/lucene/search TestPositionIncrement.java
  Log:
  Added support for Token.setPositionIncrement(int).
  
  Revision  Changes    Path
  1.2       +37 -0     jakarta-lucene/src/java/org/apache/lucene/analysis/Token.java
  
  Index: Token.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/Token.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- Token.java	18 Sep 2001 16:29:50 -0000	1.1
  +++ Token.java	5 Aug 2002 17:14:59 -0000	1.2
  @@ -74,6 +74,8 @@
     int endOffset;				  // end in source text
     String type = "word";				  // lexical type
   
  +  private int positionIncrement = 1;
  +
     /** Constructs a Token with the given term text, and start & end offsets.
         The type defaults to "word." */
     public Token(String text, int start, int end) {
  @@ -89,6 +91,41 @@
       endOffset = end;
       type = typ;
     }
  +
  +  /** Sets the position increment.  This determines the position of this token
  +   * relative to the previous Token in a {@link TokenStream}, used in phrase
  +   * searching.
  +   *
  +   * <p>The default value is one.  Zero is allowed; negative values are rejected.
  +   *
  +   * <p>Two common uses for this are:<ul>
  +   *
  +   * <li>Set it to zero to put multiple terms in the same position.  This is
  +   * useful when, e.g., a word has multiple stems.  This way searches for
  +   * phrases including either stem will match this occurrence.  In this case,
  +   * all but the first stem's increment should be set to zero: the increment of
  +   * the first instance should be one.
  +   *
  +   * <li>Set it to values greater than one to inhibit exact phrase matches.
  +   * If, for example, one does not want phrases to match across stop words,
  +   * then one could build a stop word filter that removes stop words and also
  +   * sets the increment to the number of stop words removed before each
  +   * non-stop word.
  +   *
  +   * </ul>
  +   * @see TermPositions
  +   */
  +  public void setPositionIncrement(int positionIncrement) {
  +    if (positionIncrement < 0)
  +      throw new IllegalArgumentException
  +        ("Increment must be zero or greater: " + positionIncrement);
  +    this.positionIncrement = positionIncrement;
  +  }
  +
  +  /** Returns the position increment of this Token: the value last passed to
  +   * {@link #setPositionIncrement}, or one if it was never called.
  +   */
  +  public int getPositionIncrement() { return positionIncrement; }
   
     /** Returns the Token's term text. */
     public final String termText() { return termText; }
  
  
  
  1.3       +1 -0      jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java
  
  Index: DocumentWriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- DocumentWriter.java	29 Jul 2002 19:11:15 -0000	1.2
  +++ DocumentWriter.java	5 Aug 2002 17:15:00 -0000	1.3
  @@ -165,6 +165,7 @@
   	  TokenStream stream = analyzer.tokenStream(fieldName, reader);
   	  try {
   	    for (Token t = stream.next(); t != null; t = stream.next()) {
  +              position += (t.getPositionIncrement() - 1);
   	      addPosition(fieldName, t.termText(), position++);
   	      if (position > maxFieldLength) break;
   	    }
  
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java
  
  Index: TestPositionIncrement.java
  ===================================================================
  package org.apache.lucene.search;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.search.PhraseQuery;
  import org.apache.lucene.search.Hits;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.store.RAMDirectory;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  
  import java.io.Reader;
  import java.io.IOException;
  
  import junit.framework.TestCase;
  
   /** Position increment unit test.
    *
    * <p>Indexes one document through a stub analyzer whose tokens carry
    * explicit position increments, then checks which two-term phrase
    * queries match it.
    *
    * @author Doug Cutting
    * @version $Revision: 1.1 $
    */
  public class TestPositionIncrement extends TestCase {
    public TestPositionIncrement(String name) {
      super(name);
    }

    /** Verifies that exact phrase matching honours token position increments.
     *
     * <p>The stub analyzer ignores its input and always emits the terms
     * "1" to "5" with increments {1, 2, 1, 0, 1}.  Relative to its
     * predecessor, "2" is therefore two positions on (leaving a gap), "3"
     * is adjacent to "2", "4" shares the position of "3", and "5" directly
     * follows both "3" and "4".
     */
    public static void test() throws Exception {
      Analyzer analyzer = new Analyzer() {
          public TokenStream tokenStream(String fieldName, Reader reader) {
            return new TokenStream() {
                private final String[] TOKENS = {"1", "2", "3", "4", "5"};
                private final int[] INCREMENTS = {1, 2,  1,    0,   1};
                private int i = 0;
                public Token next() throws IOException {
                  if (i == TOKENS.length)
                    return null;
                  Token t = new Token(TOKENS[i], i, i);
                  t.setPositionIncrement(INCREMENTS[i]);
                  i++;
                  return t;
                }
              };
          }
        };

      // The field text is irrelevant: the analyzer above never reads it.
      RAMDirectory store = new RAMDirectory();
      IndexWriter writer = new IndexWriter(store, analyzer, true);
      Document d = new Document();
      d.add(Field.Text("field", "bogus"));
      writer.addDocument(d);
      writer.optimize();
      writer.close();

      IndexSearcher searcher = new IndexSearcher(store);
      try {
        // "2" was pushed one extra position away from "1": no exact match.
        assertEquals(0, countPhraseHits(searcher, "1", "2"));

        // "3" immediately follows "2".
        assertEquals(1, countPhraseHits(searcher, "2", "3"));

        // "4" has increment zero, so it sits on top of "3", not after it.
        assertEquals(0, countPhraseHits(searcher, "3", "4"));

        // ...which also means "4" immediately follows "2".
        assertEquals(1, countPhraseHits(searcher, "2", "4"));

        // "5" follows both of the terms stacked at the previous position.
        assertEquals(1, countPhraseHits(searcher, "3", "5"));
        assertEquals(1, countPhraseHits(searcher, "4", "5"));

        // "5" is two positions after "2": not adjacent.
        assertEquals(0, countPhraseHits(searcher, "2", "5"));
      } finally {
        // Release the searcher even when an assertion fails.
        searcher.close();
      }
    }

    /** Returns the number of documents matching the exact two-term phrase
     * <code>first second</code> in the field named "field".
     */
    private static int countPhraseHits(IndexSearcher searcher,
                                       String first, String second)
      throws IOException {
      PhraseQuery q = new PhraseQuery();
      q.add(new Term("field", first));
      q.add(new Term("field", second));
      Hits hits = searcher.search(q);
      return hits.length();
    }
  }
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>