You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2002/07/18 16:39:58 UTC
cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestPhrasePrefixQuery.java

otis        2002/07/18 07:39:58

  Added:       src/java/org/apache/lucene/index MultipleTermPositions.java
               src/java/org/apache/lucene/search PhrasePrefixQuery.java
               src/test/org/apache/lucene/search TestPhrasePrefixQuery.java
  Log:
  - Classes that provide support for queries such as "microsoft app*".
  Submitted by:	Anders Nielsen
  Reviewed by:	otis
  
  Revision  Changes    Path
  1.1                  jakarta-lucene/src/java/org/apache/lucene/index/MultipleTermPositions.java
  
  Index: MultipleTermPositions.java
  ===================================================================
  package org.apache.lucene.index;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.IOException;
  import java.util.Arrays;
  import java.util.Iterator;
  import java.util.LinkedList;
  import java.util.List;
  
  import org.apache.lucene.util.PriorityQueue;
  
  
  /**
   * Describe class <code>MultipleTermPositions</code> here.
   *
   * @author Anders Nielsen
   * @version 1.0
   */
  public class MultipleTermPositions
      implements TermPositions
  {
      private final static class TermPositionsQueue
  	extends PriorityQueue
      {
  	TermPositionsQueue(List termPositions)
  	    throws IOException
  	{
  	    initialize(termPositions.size());
  
  	    Iterator i = termPositions.iterator();
  	    while (i.hasNext())
  	    {
  		TermPositions tp = (TermPositions)i.next();
  		if (tp.next())
  		    put(tp);
  	    }
  	}
  
  	final TermPositions peek()
  	{
  	    return (TermPositions)top();
  	}
  
  	public final boolean lessThan(Object a, Object b)
  	{
  	    return ((TermPositions)a).doc() < ((TermPositions)b).doc();
  	}
      }
  
      private final static class IntQueue
      {
  	private int _arraySize = 16;
  
  	private int _index = 0;
  	private int _lastIndex = 0;
  
  	private int[] _array = new int[_arraySize];
  
  	final void add(int i)
  	{
  	    if (_lastIndex == _arraySize)
  		growArray();
  
  	    _array[_lastIndex++] = i;
  	}
  
  	final int next()
  	{
  	    return _array[_index++];
  	}
  
  	final void sort()
  	{
  	    Arrays.sort(_array, _index, _lastIndex);
  	}
  
  	final void clear()
  	{
  	    _index = 0;
  	    _lastIndex = 0;
  	}
  
  	final int size()
  	{
  	    return (_lastIndex-_index);
  	}
  
  	private void growArray()
  	{
  	    int[] newArray = new int[_arraySize*2];
  	    System.arraycopy(_array, 0, newArray, 0, _arraySize);
  	    _array = newArray;
  	    _arraySize *= 2;
  	}
      }
  
      private int _doc;
      private int _freq;
  
      private TermPositionsQueue _termPositionsQueue;
      private IntQueue _posList;
  
      /**
       * Creates a new <code>MultipleTermPositions</code> instance.
       *
       * @param indexReader an <code>IndexReader</code> value
       * @param terms a <code>Term[]</code> value
       * @exception IOException if an error occurs
       */
      public MultipleTermPositions(IndexReader indexReader, Term[] terms)
  	throws IOException
      {
  	List termPositions = new LinkedList();
  
  	for (int i=0; i<terms.length; i++)
  	    termPositions.add(indexReader.termPositions(terms[i]));
  
  	_termPositionsQueue = new TermPositionsQueue(termPositions);
  	_posList = new IntQueue();
      }
  
      /**
       * Describe <code>next</code> method here.
       *
       * @return a <code>boolean</code> value
       * @exception IOException if an error occurs
       * @see TermDocs#next()
       */
      public final boolean next()
  	throws IOException
      {
  	if (_termPositionsQueue.size() == 0)
  	    return false;
  
  	_posList.clear();
  	_doc = _termPositionsQueue.peek().doc();
  
  	TermPositions tp;
  	do
  	{
  	    tp = _termPositionsQueue.peek();
  
  	    for (int i=0; i<tp.freq(); i++)
  		_posList.add(tp.nextPosition());
  
  	    if (tp.next())
  		_termPositionsQueue.adjustTop();
  	    else
  	    {
  		_termPositionsQueue.pop();
  		tp.close();
  	    }
  	}
  	while (_termPositionsQueue.size() > 0 && _termPositionsQueue.peek().doc() == _doc);
  
  	_posList.sort();
  	_freq = _posList.size();
  
  	return true;
      }
  
      /**
       * Describe <code>nextPosition</code> method here.
       *
       * @return an <code>int</code> value
       * @exception IOException if an error occurs
       * @see TermPositions#nextPosition()
       */
      public final int nextPosition()
  	throws IOException
      {
  	return _posList.next();
      }
  
      /**
       * Describe <code>skipTo</code> method here.
       *
       * @param target an <code>int</code> value
       * @return a <code>boolean</code> value
       * @exception IOException if an error occurs
       * @see TermDocs#skipTo(int)
       */
      public final boolean skipTo(int target)
  	throws IOException
      {
  	while (target > _termPositionsQueue.peek().doc())
  	{
  	    TermPositions tp = (TermPositions)_termPositionsQueue.pop();
  
  	    if (tp.skipTo(target))
  		_termPositionsQueue.put(tp);
  	    else
  		tp.close();
  	}
  
  	return next();
      }
  
      /**
       * Describe <code>doc</code> method here.
       *
       * @return an <code>int</code> value
       * @see TermDocs#doc()
       */
      public final int doc()
      {
  	return _doc;
      }
  
      /**
       * Describe <code>freq</code> method here.
       *
       * @return an <code>int</code> value
       * @see TermDocs#freq()
       */
      public final int freq()
      {
  	return _freq;
      }
  
      /**
       * Describe <code>close</code> method here.
       *
       * @exception IOException if an error occurs
       * @see TermDocs#close()
       */
      public final void close()
  	throws IOException
      {
  	while (_termPositionsQueue.size() > 0)
  	    ((TermPositions)_termPositionsQueue.pop()).close();
      }
  
      /**
       * Describe <code>seek</code> method here.
       *
       * @param arg0 a <code>Term</code> value
       * @exception IOException if an error occurs
       * @see TermDocs#seek(Term)
       */
      public void seek(Term arg0)
  	throws IOException
      {
  	throw new UnsupportedOperationException();
      }
  
      /**
       * Describe <code>read</code> method here.
       *
       * @param arg0 an <code>int[]</code> value
       * @param arg1 an <code>int[]</code> value
       * @return an <code>int</code> value
       * @exception IOException if an error occurs
       * @see TermDocs#read(int[], int[])
       */
      public int read(int[] arg0, int[] arg1)
  	throws IOException
      {
  	throw new UnsupportedOperationException();
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
  
  Index: PhrasePrefixQuery.java
  ===================================================================
  package org.apache.lucene.search;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.IOException;
  import java.util.ArrayList;
  import java.util.Iterator;
  
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.MultipleTermPositions;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermPositions;
  import org.apache.lucene.search.Query;
  
  /**
   * PhrasePrefixQuery is a generalized version of PhraseQuery, with an added
   * method {@link add(Term[])}.
   * To use this class, to search for the phrase "Microsoft app*" first use
   * add(Term) on the term "Microsoft", then find all terms that has "app" as
   * prefix using IndexReader.terms(Term), and use PhrasePrefixQuery.add(Term[]
   * terms) to add them to the query.
   *
   * @author Anders Nielsen
   * @version 1.0
   */
  public class PhrasePrefixQuery
      extends Query
  {
      private String _field;
      private ArrayList _termArrays = new ArrayList();
  
      private float _idf = 0.0f;
      private float _weight = 0.0f;
  
      private int _slop = 0;
  
      /**
       * Creates a new <code>PhrasePrefixQuery</code> instance.
       *
       */
      public PhrasePrefixQuery()
      {
      }
  
      /**
       * Describe <code>setSlop</code> method here.
       *
       * @param s an <code>int</code> value
       */
      public void setSlop(int s)
      {
  	_slop = s;
      }
  
      /**
       * Describe <code>getSlop</code> method here.
       *
       * @return an <code>int</code> value
       */
      public int getSlop()
      {
  	return _slop;
      }
  
      /**
       * Describe <code>add</code> method here.
       *
       * @param term a <code>Term</code> value
       */
      public void add(Term term)
      {
  	add(new Term[]{term});
      }
  
      /**
       * Describe <code>add</code> method here.
       *
       * @param terms a <code>Term[]</code> value
       */
      public void add(Term[] terms)
      {
  	if (_termArrays.size() == 0)
  	    _field = terms[0].field();
  
        	for (int i=0; i<terms.length; i++)
  	{
  	    if (terms[i].field() != _field)
  	    {
  		throw new IllegalArgumentException(
  		    "All phrase terms must be in the same field (" + _field + "): "
  		    + terms[i]);
  	    }
  	}
  
  	_termArrays.add(terms);
      }
  
      Scorer scorer(IndexReader reader)
  	throws IOException
      {
      	if (_termArrays.size() == 0)  // optimize zero-term case
  	    return null;
  
  	if (_termArrays.size() == 1)  // optimize one-term case
  	{
  	    Term[] terms = (Term[])_termArrays.get(0);
  
  	    BooleanQuery boq = new BooleanQuery();
  	    for (int i=0; i<terms.length; i++)
  		boq.add(new TermQuery(terms[i]), false, false);
  
  	    return boq.scorer(reader);
      	}
  
      	TermPositions[] tps = new TermPositions[_termArrays.size()];
  	for (int i=0; i<tps.length; i++)
  	{
  	    Term[] terms = (Term[])_termArrays.get(i);
  
  	    TermPositions p;
  	    if (terms.length > 1)
  		p = new MultipleTermPositions(reader, terms);
  	    else
  		p = reader.termPositions(terms[0]);
  
  	    if (p == null)
  		return null;
  
  	    tps[i] = p;
  	}
  
  	if (_slop == 0)
  	    return new ExactPhraseScorer(tps, reader.norms(_field), _weight);
  	else
  	    return new SloppyPhraseScorer(tps, _slop, reader.norms(_field), _weight);
      }
  
      float sumOfSquaredWeights(Searcher searcher)
  	throws IOException
      {
  	Iterator i = _termArrays.iterator();
  	while (i.hasNext())
  	{
  	    Term[] terms = (Term[])i.next();
  	    for (int j=0; j<terms.length; j++)
  		_idf += Similarity.idf(terms[j], searcher);
  	}
  
  	_weight = _idf * boost;
  	return _weight * _weight;
      }
  
      void normalize(float norm)
      {
  	_weight *= norm;
  	_weight *= _idf;
      }
  
      /**
       * Describe <code>toString</code> method here.
       *
       * This method assumes that the first term in a array of terms is the
       * prefix for the whole array. That might not necessarily be so.
       *
       * @param f a <code>String</code> value
       * @return a <code>String</code> value
       */
      public final String toString(String f)
      {
  	StringBuffer buffer = new StringBuffer();
  	if (!_field.equals(f))
  	{
  	    buffer.append(_field);
  	    buffer.append(":");
  	}
  
  	buffer.append("\"");
  	Iterator i = _termArrays.iterator();
  	while (i.hasNext())
  	{
  	    Term[] terms = (Term[])i.next();
  	    buffer.append(terms[0].text() + (terms.length > 0 ? "*" : ""));
  	}
  	buffer.append("\"");
  
  	if (_slop != 0)
  	{
  	    buffer.append("~");
  	    buffer.append(_slop);
  	}
  
  	if (boost != 1.0f)
  	{
  	    buffer.append("^");
  	    buffer.append(Float.toString(boost));
  	}
  
  	return buffer.toString();
      }
  }
  
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/search/TestPhrasePrefixQuery.java
  
  Index: TestPhrasePrefixQuery.java
  ===================================================================
  package org.apache.lucene.search;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermEnum;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.store.RAMDirectory;
  import org.apache.lucene.analysis.SimpleAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  
  import junit.framework.TestCase;
  
  import java.io.IOException;
  import java.util.LinkedList;
  
  /**
   * This class tests PhrasePrefixQuery class.
   *
   * @author Otis Gospodnetic
   * @version $Id: TestPhrasePrefixQuery.java,v 1.1 2002/07/18 14:39:58 otis Exp $
   */
  public class TestPhrasePrefixQuery
      extends TestCase
  {
      public TestPhrasePrefixQuery(String name)
      {
  	super(name);
      }
  
      /**
       *
       */
      public void testPhrasePrefix()
          throws IOException
      {
          RAMDirectory indexStore = new RAMDirectory();
          IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true);
          Document doc1 = new Document();
          Document doc2 = new Document();
          Document doc3 = new Document();
  	Document doc4 = new Document();
  	doc1.add(Field.Text("body", "blueberry pie"));
          doc2.add(Field.Text("body", "blueberry pizza"));
          doc3.add(Field.Text("body", "blueberry chewing gum"));
          doc4.add(Field.Text("body", "picadelly circus"));
          writer.addDocument(doc1);
          writer.addDocument(doc2);
          writer.addDocument(doc3);
          writer.addDocument(doc4);
  	writer.optimize();
  	writer.close();
  
  	IndexSearcher searcher = new IndexSearcher(indexStore);
  
  	PhrasePrefixQuery query1 = new PhrasePrefixQuery();
  	PhrasePrefixQuery query2 = new PhrasePrefixQuery();
  	query1.add(new Term("body", "blueberry"));
  	query2.add(new Term("body", "strawberry"));
  
  	LinkedList termsWithPrefix = new LinkedList();
          IndexReader ir = IndexReader.open(indexStore);
  
  	// this TermEnum gives "picadelly", "pie" and "pizza".
          TermEnum te = ir.terms(new Term("body", "pi*"));
          do {
              termsWithPrefix.add(te.term());
          } while (te.next());
  	query1.add((Term[])termsWithPrefix.toArray(new Term[0]));
  	query2.add((Term[])termsWithPrefix.toArray(new Term[0]));
  
  	Hits result;
  	result = searcher.search(query1);
  	assertEquals(2, result.length());
  
  	result = searcher.search(query2);
  	assertEquals(0, result.length());
      }
  }
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>