You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by Damian Gajda <dg...@caltha.pl> on 2003/12/09 20:44:00 UTC
Re: Revival of Dmitry's Term Vector patches
Hello Otis,
Here is a patch with documentation from Dmitry.
I used
cvs diff -uN
Hope it is OK now.
--
Damian
Re: Revival of Dmitry's Term Vector patches
Posted by Otis Gospodnetic <ot...@yahoo.com>.
Hello,
Thanks Damian!
This came inlined for me (i.e. wrapped/broken lines, etc.)
It could be just my email client (Yahoo's web mail) inlining this.
If anyone received this as a real attachment, could you apply this
patch?
Thanks,
Otis
--- Damian Gajda <dg...@caltha.pl> wrote:
> Hello Otis,
>
> Here is a patch with documentation from Dmitry.
>
> I used
> cvs diff -uN
>
> Hope it is OK now.
>
> --
> Damian
>
> > Index: src/java/org/apache/lucene/document/Field.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
> retrieving revision 1.11
> diff -u -r1.11 Field.java
> --- src/java/org/apache/lucene/document/Field.java 20 Mar 2003
> 18:28:13 -0000 1.11
> +++ src/java/org/apache/lucene/document/Field.java 9 Dec 2003
> 19:39:05 -0000
> @@ -162,6 +162,8 @@
> is used. Exactly one of stringValue() and readerValue() must be
> set. */
> public Reader readerValue() { return readerValue; }
>
> + /** Create a field by specifying all parameters.
> + */
> public Field(String name, String string,
> boolean store, boolean index, boolean token) {
> if (name == null)
> Index: src/java/org/apache/lucene/index/FieldInfos.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfos.java,v
> retrieving revision 1.4
> diff -u -r1.4 FieldInfos.java
> --- src/java/org/apache/lucene/index/FieldInfos.java 21 Oct 2003
> 17:59:16 -0000 1.4
> +++ src/java/org/apache/lucene/index/FieldInfos.java 9 Dec 2003
> 19:39:05 -0000
> @@ -68,6 +68,12 @@
> import org.apache.lucene.store.OutputStream;
> import org.apache.lucene.store.InputStream;
>
> +/** Access to the Field Info file that describes document fields and
> whether or
> + * not they are indexed. Each segment has a separate Field Info
> file. Objects
> + * of this class are thread-safe for multiple readers, but only one
> thread can
> + * be adding documents at a time, with no other reader or writer
> threads
> + * accessing this object.
> + */
> final class FieldInfos {
> private Vector byNumber = new Vector();
> private Hashtable byName = new Hashtable();
> @@ -94,6 +100,10 @@
> }
> }
>
> + /** Adds in information for a set of FieldInfos.
> + * Walks the <code>names</code> collection and adds its entries to
> + * this one. The method is void; it does not return a mapping.
> + */
> final void add(Collection names, boolean isIndexed) {
> Iterator i = names.iterator();
> while (i.hasNext()) {
> @@ -101,6 +111,10 @@
> }
> }
>
> + /** If the field is not yet known, adds it. If it is known, checks
> + * to make sure that the isIndexed flag is the same as was given
> + * previously for this field. If not - throws
> IllegalStateException.
> + */
> final void add(String name, boolean isIndexed) {
> FieldInfo fi = fieldInfo(name);
> if (fi == null)
> Index: src/java/org/apache/lucene/index/SegmentMergeInfo.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeInfo.java,v
> retrieving revision 1.2
> diff -u -r1.2 SegmentMergeInfo.java
> --- src/java/org/apache/lucene/index/SegmentMergeInfo.java 21 Oct
> 2003 17:59:16 -0000 1.2
> +++ src/java/org/apache/lucene/index/SegmentMergeInfo.java 9 Dec 2003
> 19:39:06 -0000
> @@ -57,14 +57,38 @@
> import java.io.IOException;
> import org.apache.lucene.util.BitVector;
>
> +/** Data container to work with SegmentMergeQueue. Represents a
> single segment
> + * to be merged. Maintains the segment reader, TermEnum, and
> TermPositions
> + * for this segment.
> + */
> final class SegmentMergeInfo {
> + /** The current term of this segment, or null if none. */
> Term term;
> +
> + /** Index of the 0th document from this segment in the merged
> document numbering. */
> int base;
> +
> + /** This segment's term enum. Do not use directly. */
> TermEnum termEnum;
> +
> + /** This segment's reader. Do not use directly. */
> IndexReader reader;
> +
> + /** Postings for the current term. */
> TermPositions postings;
> +
> +
> + /** Maps around deleted docs. Contains a slot for each document in
> the
> + * reader. Slots corresponding to deleted docs have the value of
> -1. The
> + * rest have their new document numbers that start at 0. This
> value
> + * added to <code>base</code> is the document number in the
> merged numbering.
> + */
> int[] docMap = null; // maps around deleted docs
>
> + /** Create a new merge info. Base <code>b</code> is a starting
> + * number for documents from this segment in the merged document
> + * numbering.
> + */
> SegmentMergeInfo(int b, TermEnum te, IndexReader r)
> throws IOException {
> base = b;
> @@ -87,6 +111,12 @@
> }
> }
>
> +
> + /** Shift to the next term on this segment's TermEnum. The new
> + * term becomes the current term for this segment, affecting the
> + * ordering of the SegmentMergeQueue. If no more terms remain
> + * in this segment, returns false and resets the current term to
> null.
> + */
> final boolean next() throws IOException {
> if (termEnum.next()) {
> term = termEnum.term();
> Index: src/java/org/apache/lucene/index/SegmentMergeQueue.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeQueue.java,v
> retrieving revision 1.1.1.1
> diff -u -r1.1.1.1 SegmentMergeQueue.java
> --- src/java/org/apache/lucene/index/SegmentMergeQueue.java 18 Sep
> 2001 16:29:53 -0000 1.1.1.1
> +++ src/java/org/apache/lucene/index/SegmentMergeQueue.java 9 Dec
> 2003 19:39:06 -0000
> @@ -57,6 +57,10 @@
> import java.io.IOException;
> import org.apache.lucene.util.PriorityQueue;
>
> +/** Priority queue of SegmentMergeInfo objects. The queue sorts the
> + * info objects by their current term, and if the terms are equal,
> + * by their base offset.
> + */
> final class SegmentMergeQueue extends PriorityQueue {
> SegmentMergeQueue(int size) {
> initialize(size);
> Index: src/java/org/apache/lucene/index/SegmentMerger.java
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMerger.java,v
> retrieving revision 1.6
> diff -u -r1.6 SegmentMerger.java
> --- src/java/org/apache/lucene/index/SegmentMerger.java 31 Oct 2003
> 09:28:44 -0000 1.6
> +++ src/java/org/apache/lucene/index/SegmentMerger.java 9 Dec 2003
> 19:39:07 -0000
> @@ -77,20 +77,33 @@
> "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
> };
>
> + /** Create a segment merger that will merge a number of segments
> (specified
> + * as SegmentReaders added to this object with calls to
> <code>add</code>) into a
> + * single segment with the specified <code>name</code>.
> + */
> SegmentMerger(Directory dir, String name, boolean compoundFile) {
> directory = dir;
> segment = name;
> useCompoundFile = compoundFile;
> }
>
> + /** Add segment reader to be merged.
> + *
> + */
> final void add(IndexReader reader) {
> readers.addElement(reader);
> }
>
> + /** Return one of the segment readers being merged.
> + *
> + */
> final IndexReader segmentReader(int i) {
> return (IndexReader)readers.elementAt(i);
> }
>
> + /** Start the merge. All segment readers to be merged must have
> been added
> + * prior to this call.
> + */
> final int merge() throws IOException {
> int value;
> try {
> @@ -148,6 +161,9 @@
> }
>
>
> + /** Merge the field information from the segment readers.
> + * Called from <code>merge</code>.
> + */
> private final int mergeFields() throws IOException {
> fieldInfos = new FieldInfos(); // merge field names
> int docCount = 0;
> @@ -181,6 +197,9 @@
> private TermInfosWriter termInfosWriter = null;
> private SegmentMergeQueue queue = null;
>
> + /** Merge the term index, frequency and proximity information
> + * from specified segment readers. Called from
> <code>merge</code>.
> + */
> private final void mergeTerms() throws IOException {
> try {
> freqOutput = directory.createFile(segment + ".frq");
> @@ -198,7 +217,11 @@
> }
> }
>
> + /** Merge the term index information. Called from
> <code>mergeTerms</code>.
> + */
> private final void mergeTermInfos() throws IOException {
> + // Create and populate a priority queue of segments to be merged.
> + // Segments are sorted by their top term and the base doc number in
> the merged segment.
> queue = new SegmentMergeQueue(readers.size());
> int base = 0;
> for (int i = 0; i < readers.size(); i++) {
> @@ -220,13 +243,19 @@
> Term term = match[0].term;
> SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
>
> + // pop off the queue and put into match[] all segments
> + // that have the same term at the top
> while (top != null && term.compareTo(top.term) == 0) {
> match[matchSize++] = (SegmentMergeInfo)queue.pop();
> top = (SegmentMergeInfo)queue.top();
> }
>
> + // perform the merge for all segments that are positioned on
> + // the same term
> mergeTermInfo(match, matchSize); // add new TermInfo
>
> + // advance the matched segments to the next term and, if one
> exists, put
> + // the segment back onto the queue (priority queue takes care
> of sorting them)
> while (matchSize > 0) {
> SegmentMergeInfo smi = match[--matchSize];
> if (smi.next())
> @@ -239,6 +268,14 @@
>
> private final TermInfo termInfo = new TermInfo(); // minimize
> consing
>
> +
> + /** Merge one term found in one or more segments. The array
> <code>smis</code>
> + * contains segments that are positioned at the same term.
> <code>n</code>
> + * is the number of cells in the array actually occupied.
> + *
> + * @param smis array of segments
> + * @param n number of cells in the array actually occupied
> + */
> private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
> throws IOException {
> long freqPointer = freqOutput.getFilePointer();
> @@ -253,6 +290,14 @@
> }
> }
>
> + /** Process postings from multiple segments all positioned on the
> + * same term. Writes out merged entries into freqOutput and
> + * the proxOutput streams.
> + *
> + * @param smis array of segments
> + * @param n number of cells in the array actually occupied
> + * @return number of documents across all segments where this term
> was found
> + */
> private final int appendPostings(SegmentMergeInfo[] smis, int n)
> throws IOException {
> int lastDoc = 0;
> @@ -295,6 +340,10 @@
> }
> return df;
> }
> +
> + /** Merge field normalization factors for the specified segment
> readers.
> + * Called from <code>merge</code>.
> + */
> private final void mergeNorms() throws IOException {
> for (int i = 0; i < fieldInfos.size(); i++) {
> FieldInfo fi = fieldInfos.fieldInfo(i);
>
> >
---------------------------------------------------------------------
> To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: lucene-dev-help@jakarta.apache.org
__________________________________
Do you Yahoo!?
New Yahoo! Photos - easier uploading and sharing.
http://photos.yahoo.com/
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org