You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@lucene.apache.org by Damian Gajda <dg...@caltha.pl> on 2003/12/09 20:44:00 UTC

Re: Revival of Dmitry's Term Vector patches

Hello Otis,

Here is a patch with documentation from Dmitry.

I used
cvs diff -uN

Hope it is OK now.

-- 
Damian

Re: Revival of Dmitry's Term Vector patches

Posted by Otis Gospodnetic <ot...@yahoo.com>.

Hello,

Thanks Damian!

This came inlined for me (i.e. wrapped/broken lines, etc.)
It could be just my email client (Yahoo's web mail) inlining this.

If anyone received this as a real attachment, could you apply this
patch?

Thanks,
Otis


--- Damian Gajda <dg...@caltha.pl> wrote:
> Hello Otis,
> 
> Here is a patch with documentation from Dmitry.
> 
> I used
> cvs diff -uN
> 
> Hope it is OK now.
> 
> -- 
> Damian
> 
> > Index: src/java/org/apache/lucene/document/Field.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
> retrieving revision 1.11
> diff -u -r1.11 Field.java
> --- src/java/org/apache/lucene/document/Field.java	20 Mar 2003
> 18:28:13 -0000	1.11
> +++ src/java/org/apache/lucene/document/Field.java	9 Dec 2003
> 19:39:05 -0000
> @@ -162,6 +162,8 @@
>      is used.  Exactly one of stringValue() and readerValue() must be
> set. */
>    public Reader readerValue()	{ return readerValue; }
>  
> +  /** Create a field by specifying all parameters.
> +   */
>    public Field(String name, String string,
>  	       boolean store, boolean index, boolean token) {
>      if (name == null)
> Index: src/java/org/apache/lucene/index/FieldInfos.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfos.java,v
> retrieving revision 1.4
> diff -u -r1.4 FieldInfos.java
> --- src/java/org/apache/lucene/index/FieldInfos.java	21 Oct 2003
> 17:59:16 -0000	1.4
> +++ src/java/org/apache/lucene/index/FieldInfos.java	9 Dec 2003
> 19:39:05 -0000
> @@ -68,6 +68,12 @@
>  import org.apache.lucene.store.OutputStream;
>  import org.apache.lucene.store.InputStream;
>  
> +/** Access to the Field Info file that describes document fields and
> whether or
> + *  not they are indexed. Each segment has a separate Field Info
> file. Objects
> + *  of this class are thread-safe for multiple readers, but only one
> thread can
> + *  be adding documents at a time, with no other reader or writer
> threads
> + *  accessing this object.
> + */
>  final class FieldInfos {
>    private Vector byNumber = new Vector();
>    private Hashtable byName = new Hashtable();
> @@ -94,6 +100,10 @@
>      }
>    }
>  
> +  /** Adds in information for the set of fields named in the
> +   *  <code>names</code> collection. The method is declared
> +   *  <code>void</code>, so it does not return a field-number mapping.
> +   */
>    final void add(Collection names, boolean isIndexed) {
>      Iterator i = names.iterator();
>      while (i.hasNext()) {
> @@ -101,6 +111,10 @@
>      }
>    }
>  
> +  /** If the field is not yet known, adds it. If it is known, checks
> +	*  to make sure that the isIndexed flag is the same as was given
> +	*  previously for this field. If not - throws
> IllegalStateException.
> +	*/
>    final void add(String name, boolean isIndexed) {
>      FieldInfo fi = fieldInfo(name);
>      if (fi == null)
> Index: src/java/org/apache/lucene/index/SegmentMergeInfo.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeInfo.java,v
> retrieving revision 1.2
> diff -u -r1.2 SegmentMergeInfo.java
> --- src/java/org/apache/lucene/index/SegmentMergeInfo.java	21 Oct
> 2003 17:59:16 -0000	1.2
> +++ src/java/org/apache/lucene/index/SegmentMergeInfo.java	9 Dec 2003
> 19:39:06 -0000
> @@ -57,14 +57,38 @@
>  import java.io.IOException;
>  import org.apache.lucene.util.BitVector;
>  
> +/** Data container to work with SegmentMergeQueue. Represents a
> single segment
> + *  to be merged. Maintains the segment reader, TermEnum, and
> TermPositions
> + *  for this segment.
> + */
>  final class SegmentMergeInfo {
> +  /** The current term of this segment, or null if none. */
>    Term term;
> +
> +  /** Index of the 0th document from this segment in the merged
> document numbering. */
>    int base;
> +
> +  /** This segment's term enum. Do not use directly. */
>    TermEnum termEnum;
> +
> +  /** This segment's reader. Do not use directly. */
>    IndexReader reader;
> +
> +  /** Postings for the current term. */
>    TermPositions postings;
> +
> +
> +  /** Maps around deleted docs. Contains a slot for each document in
> the
> +   *  reader. Slots corresponding to deleted docs have the value of
> -1. The
> +   *  rest have their new document numbers that start at 0. This
> value
> +   *  added to <code>base</code> is the document number in the
> merged numbering.
> +   */
>    int[] docMap = null;				  // maps around deleted docs
>  
> +  /** Create a new merge info. Base <code>b</code> is a starting
> +   *  number for documents from this segment in the merged document
> +   *  numbering.
> +   */
>    SegmentMergeInfo(int b, TermEnum te, IndexReader r)
>      throws IOException {
>      base = b;
> @@ -87,6 +111,12 @@
>      }
>    }
>  
> +
> +  /** Shift to the next term on this segment's TermEnum. The new
> +   *  term becomes the current term for this segment, affecting the
> +   *  ordering of the SegmentMergeQueue. If no more terms remain
> +   *  in this segment, returns false and resets the current term to
> null.
> +   */
>    final boolean next() throws IOException {
>      if (termEnum.next()) {
>        term = termEnum.term();
> Index: src/java/org/apache/lucene/index/SegmentMergeQueue.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeQueue.java,v
> retrieving revision 1.1.1.1
> diff -u -r1.1.1.1 SegmentMergeQueue.java
> --- src/java/org/apache/lucene/index/SegmentMergeQueue.java	18 Sep
> 2001 16:29:53 -0000	1.1.1.1
> +++ src/java/org/apache/lucene/index/SegmentMergeQueue.java	9 Dec
> 2003 19:39:06 -0000
> @@ -57,6 +57,10 @@
>  import java.io.IOException;
>  import org.apache.lucene.util.PriorityQueue;
>  
> +/** Priority queue of SegmentMergeInfo objects. The queue sorts the
> + *  info objects by their current term, and if the terms are equal,
> + *  by their base offset.
> + */
>  final class SegmentMergeQueue extends PriorityQueue {
>    SegmentMergeQueue(int size) {
>      initialize(size);
> Index: src/java/org/apache/lucene/index/SegmentMerger.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMerger.java,v
> retrieving revision 1.6
> diff -u -r1.6 SegmentMerger.java
> --- src/java/org/apache/lucene/index/SegmentMerger.java	31 Oct 2003
> 09:28:44 -0000	1.6
> +++ src/java/org/apache/lucene/index/SegmentMerger.java	9 Dec 2003
> 19:39:07 -0000
> @@ -77,20 +77,33 @@
>      "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
>    };
>    
> +  /** Create a segment merger that will merge a number of segments
> (specified
> +   *  as SegmentReaders added to this object with calls to
> <code>add</code>) into a
> +   *  single segment with the specified <code>name</code>.
> +   */
>    SegmentMerger(Directory dir, String name, boolean compoundFile) {
>      directory = dir;
>      segment = name;
>      useCompoundFile = compoundFile;
>    }
>  
> +  /** Add segment reader to be merged.
> +   *
> +   */
>    final void add(IndexReader reader) {
>      readers.addElement(reader);
>    }
>  
> +  /** Return one of the segment readers being merged.
> +   *
> +   */
>    final IndexReader segmentReader(int i) {
>      return (IndexReader)readers.elementAt(i);
>    }
>  
> +  /** Start the merge. All segment readers to be merged must have
> been added
> +   *  prior to this call.
> +   */
>    final int merge() throws IOException {
>      int value;
>      try {
> @@ -148,6 +161,9 @@
>    }
>    
>    
> +  /** Merge the field information from the segment readers.
> +   *  Called from <code>merge</code>.
> +   */
>    private final int mergeFields() throws IOException {
>      fieldInfos = new FieldInfos();		  // merge field names
>      int docCount = 0;
> @@ -181,6 +197,9 @@
>    private TermInfosWriter termInfosWriter = null;
>    private SegmentMergeQueue queue = null;
>  
> +  /** Merge the term index, frequency and proximity information
> +   *  from specified segment readers. Called from
> <code>merge</code>.
> +   */
>    private final void mergeTerms() throws IOException {
>      try {
>        freqOutput = directory.createFile(segment + ".frq");
> @@ -198,7 +217,11 @@
>      }
>    }
>  
> +  /** Merge the term index information. Called from
> <code>mergeTerms</code>.
> +   */
>    private final void mergeTermInfos() throws IOException {
> +	// Create and populate a priority queue of segments to be merged.
> +	// Segments are sorted by their top term and the base doc number in
> the merged segment.
>      queue = new SegmentMergeQueue(readers.size());
>      int base = 0;
>      for (int i = 0; i < readers.size(); i++) {
> @@ -220,13 +243,19 @@
>        Term term = match[0].term;
>        SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
>        
> +      // pop off the queue and put into match[] all segments
> +      // that have the same term at the top
>        while (top != null && term.compareTo(top.term) == 0) {
>          match[matchSize++] = (SegmentMergeInfo)queue.pop();
>          top = (SegmentMergeInfo)queue.top();
>        }
>  
> +      // perform the merge for all segments that are positioned on
> +      // the same term
>        mergeTermInfo(match, matchSize);		  // add new TermInfo
>        
> +      // advance the matched segments to the next term and, if one
> exists, put
> +      // the segment back onto the queue (priority queue takes care
> of sorting them)
>        while (matchSize > 0) {
>          SegmentMergeInfo smi = match[--matchSize];
>          if (smi.next())
> @@ -239,6 +268,14 @@
>  
>    private final TermInfo termInfo = new TermInfo(); // minimize
> consing
>  
> +
> +  /** Merge one term found in one or more segments. The array
> <code>smis</code>
> +   *  contains segments that are positioned at the same term.
> <code>n</code>
> +   *  is the number of cells in the array actually occupied.
> +   *
> +   * @param smis array of segments
> +   * @param n number of cells in the array actually occupied
> +   */
>    private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
>         throws IOException {
>      long freqPointer = freqOutput.getFilePointer();
> @@ -253,6 +290,14 @@
>      }
>    }
>  
> +  /** Process postings from multiple segments all positioned on the
> +   *  same term. Writes out merged entries into freqOutput and
> +   *  the proxOutput streams.
> +   *
> +   * @param smis array of segments
> +   * @param n number of cells in the array actually occupied
> +   * @return number of documents across all segments where this term
> was found
> +   */
>    private final int appendPostings(SegmentMergeInfo[] smis, int n)
>         throws IOException {
>      int lastDoc = 0;
> @@ -295,6 +340,10 @@
>      }
>      return df;
>    }
> +
> +  /** Merge field normalization factors for the specified segment
> readers.
> +   *  Called from <code>merge</code>.
> +   */
>    private final void mergeNorms() throws IOException {
>      for (int i = 0; i < fieldInfos.size(); i++) {
>        FieldInfo fi = fieldInfos.fieldInfo(i);
> 
> >
---------------------------------------------------------------------
> To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


__________________________________
Do you Yahoo!?
New Yahoo! Photos - easier uploading and sharing.
http://photos.yahoo.com/

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org