You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by cu...@apache.org on 2004/10/08 17:58:49 UTC
cvs commit: jakarta-lucene/src/java/org/apache/lucene/index TermBuffer.java SegmentTermEnum.java TermInfosReader.java

cutting     2004/10/08 08:58:49

  Modified:    .        CHANGES.txt
               src/java/org/apache/lucene/index SegmentTermEnum.java
                        TermInfosReader.java
  Added:       src/java/org/apache/lucene/index TermBuffer.java
  Log:
  Optimize term dictionary lookup to allocate fewer terms.
  
  Revision  Changes    Path
  1.118     +6 -1      jakarta-lucene/CHANGES.txt
  
  Index: CHANGES.txt
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
  retrieving revision 1.117
  retrieving revision 1.118
  diff -u -r1.117 -r1.118
  --- CHANGES.txt	6 Oct 2004 12:15:05 -0000	1.117
  +++ CHANGES.txt	8 Oct 2004 15:58:49 -0000	1.118
  @@ -97,6 +97,11 @@
   21. Add a serializable Parameter Class to standardize parameter enum
       classes in BooleanClause and Field. (Christoph)
   
  +22. Optimize term-dictionary lookup to allocate far fewer terms when
  +    scanning for the matching term.  This speeds searches involving
  +    low-frequency terms, where the cost of dictionary lookup can be
  +    significant. (cutting)
  +
   
   1.4.1
   
  
  
  
  1.9       +25 -28    jakarta-lucene/src/java/org/apache/lucene/index/SegmentTermEnum.java
  
  Index: SegmentTermEnum.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/SegmentTermEnum.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- SegmentTermEnum.java	16 Sep 2004 21:13:37 -0000	1.8
  +++ SegmentTermEnum.java	8 Oct 2004 15:58:49 -0000	1.9
  @@ -25,7 +25,10 @@
     long size;
     long position = -1;
   
  -  private Term term = new Term("", "");
  +  private TermBuffer termBuffer = new TermBuffer();
  +  private TermBuffer prevBuffer = new TermBuffer();
  +  private TermBuffer scratch;                     // used for scanning
  +
     private TermInfo termInfo = new TermInfo();
   
     private int format;
  @@ -34,9 +37,6 @@
     int indexInterval;
     int skipInterval;
     private int formatM1SkipInterval;
  -  Term prev;
  -
  -  private char[] buffer = {};
   
     SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi)
             throws IOException {
  @@ -89,7 +89,10 @@
   
       clone.input = (IndexInput) input.clone();
       clone.termInfo = new TermInfo(termInfo);
  -    if (term != null) clone.growBuffer(term.text.length());
  +
  +    clone.termBuffer = (TermBuffer)termBuffer.clone();
  +    clone.prevBuffer = (TermBuffer)prevBuffer.clone();
  +    clone.scratch = null;
   
       return clone;
     }
  @@ -98,21 +101,20 @@
             throws IOException {
       input.seek(pointer);
       position = p;
  -    term = t;
  -    prev = null;
  +    termBuffer.set(t);
  +    prevBuffer.reset();
       termInfo.set(ti);
  -    growBuffer(term.text.length());		  // copy term text into buffer
     }
   
     /** Increments the enumeration to the next element.  True if one exists.*/
     public final boolean next() throws IOException {
       if (position++ >= size - 1) {
  -      term = null;
  +      termBuffer.reset();
         return false;
       }
   
  -    prev = term;
  -    term = readTerm();
  +    prevBuffer.set(termBuffer);
  +    termBuffer.read(input, fieldInfos);
   
       termInfo.docFreq = input.readVInt();	  // read doc freq
       termInfo.freqPointer += input.readVLong();	  // read freq pointer
  @@ -138,28 +140,23 @@
       return true;
     }
   
  -  private final Term readTerm() throws IOException {
  -    int start = input.readVInt();
  -    int length = input.readVInt();
  -    int totalLength = start + length;
  -    if (buffer.length < totalLength)
  -      growBuffer(totalLength);
  -
  -    input.readChars(buffer, start, length);
  -    return new Term(fieldInfos.fieldName(input.readVInt()),
  -            new String(buffer, 0, totalLength), false);
  -  }
  -
  -  private final void growBuffer(int length) {
  -    buffer = new char[length];
  -    for (int i = 0; i < term.text.length(); i++)  // copy contents
  -      buffer[i] = term.text.charAt(i);
  +  /** Optimized scan, without allocating new terms. */
  +  final void scanTo(Term term) throws IOException {
  +    if (scratch == null)
  +      scratch = new TermBuffer();
  +    scratch.set(term);
  +    while (scratch.compareTo(termBuffer) > 0 && next()) {}
     }
   
     /** Returns the current Term in the enumeration.
      Initially invalid, valid after next() called for the first time.*/
     public final Term term() {
  -    return term;
  +    return termBuffer.toTerm();
  +  }
  +
  +  /** Returns the previous Term enumerated. Initially null.*/
  +  final Term prev() {
  +    return prevBuffer.toTerm();
     }
   
     /** Returns the current TermInfo in the enumeration.
  
  
  
  1.11      +2 -2      jakarta-lucene/src/java/org/apache/lucene/index/TermInfosReader.java
  
  Index: TermInfosReader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/TermInfosReader.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- TermInfosReader.java	16 Sep 2004 21:13:37 -0000	1.10
  +++ TermInfosReader.java	8 Oct 2004 15:58:49 -0000	1.11
  @@ -129,7 +129,7 @@
       // optimize sequential access: first try scanning cached enum w/o seeking
       SegmentTermEnum enumerator = getEnum();
       if (enumerator.term() != null                 // term is at or past current
  -	&& ((enumerator.prev != null && term.compareTo(enumerator.prev) > 0)
  +	&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
   	    || term.compareTo(enumerator.term()) >= 0)) {
         int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1;
         if (indexTerms.length == enumOffset	  // but before end of block
  @@ -145,7 +145,7 @@
     /** Scans within block for matching term. */
     private final TermInfo scanEnum(Term term) throws IOException {
       SegmentTermEnum enumerator = getEnum();
  -    while (term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}
  +    enumerator.scanTo(term);
       if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0)
         return enumerator.termInfo();
       else
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/index/TermBuffer.java
  
  Index: TermBuffer.java
  ===================================================================
  package org.apache.lucene.index;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import java.io.IOException;
  import org.apache.lucene.store.IndexInput;
  
  /**
   * A reusable, mutable holder for one term: a field name plus the term text
   * kept in a growable <code>char[]</code>.  The buffer can be refilled from
   * an input stream, from a {@link Term} or from another buffer without
   * creating a new Term each time; a Term is only materialized (and then
   * cached) when {@link #toTerm()} is called.
   *
   * <p>A buffer whose field is <code>null</code> is "unset": {@link #toTerm()}
   * returns <code>null</code> for it.
   */
  final class TermBuffer implements Cloneable {
    private static final char[] NO_CHARS = new char[0];

    private String field;                         // null while the buffer is unset
    private char[] text = NO_CHARS;               // only [0, textLength) is valid
    private int textLength;
    private Term term;                            // cached

    /**
     * Orders this buffer relative to <code>other</code>: by field first and,
     * when both fields are the same instance, by text.
     *
     * <p>Throws <code>NullPointerException</code> when exactly one of the two
     * buffers is unset, so callers must only compare buffers that hold a term.
     *
     * @return a negative, zero or positive value when this buffer sorts
     *         before, equal to or after <code>other</code>
     */
    public final int compareTo(TermBuffer other) {
      if (field == other.field)                   // fields are interned
        return compareChars(text, textLength, other.text, other.textLength);
      else
        return field.compareTo(other.field);
    }

    /**
     * Compares the first <code>len1</code> chars of <code>v1</code> with the
     * first <code>len2</code> chars of <code>v2</code>, char by char; when one
     * is a prefix of the other the shorter one sorts first.
     */
    private static final int compareChars(char[] v1, int len1,
                                          char[] v2, int len2) {
      int end = Math.min(len1, len2);
      for (int k = 0; k < end; k++) {
        char c1 = v1[k];
        char c2 = v2[k];
        if (c1 != c2) {
          return c1 - c2;                         // chars are 16 bit: cannot overflow
        }
      }
      return len1 - len2;
    }

    /**
     * Sets the number of valid chars, growing the backing array to exactly
     * <code>newLength</code> when it is too small.  The currently valid chars
     * are kept when the array is replaced.
     */
    private final void setTextLength(int newLength) {
      if (text.length < newLength) {
        char[] newText = new char[newLength];
        System.arraycopy(text, 0, newText, 0, textLength);
        text = newText;
      }
      textLength = newLength;
    }

    /**
     * Refills this buffer from <code>input</code>.  Reads two VInts
     * (<code>start</code> and <code>length</code>), keeps the first
     * <code>start</code> chars already in the buffer, reads
     * <code>length</code> chars after them, and finally reads a VInt that is
     * resolved to the field name through <code>fieldInfos</code>.
     *
     * @param input stream positioned at the term entry to read
     * @param fieldInfos used to turn the stored field number into a name
     * @throws IOException if reading from <code>input</code> fails
     */
    public final void read(IndexInput input, FieldInfos fieldInfos)
      throws IOException {
      this.term = null;                           // invalidate cache
      int start = input.readVInt();
      int length = input.readVInt();
      int totalLength = start + length;
      setTextLength(totalLength);
      input.readChars(this.text, start, length);
      this.field = fieldInfos.fieldName(input.readVInt());
    }

    /**
     * Makes this buffer hold <code>term</code>.  A <code>null</code> term
     * resets the buffer.  The given Term is also kept as the cached result
     * of {@link #toTerm()}.
     */
    public final void set(Term term) {
      if (term == null) {
        reset();
        return;
      }

      // copy text into the buffer
      setTextLength(term.text().length());
      term.text().getChars(0, term.text().length(), text, 0);

      this.field = term.field();
      this.term = term;
    }

    /**
     * Copies the field, text and cached Term of <code>other</code> into this
     * buffer.  The text is copied, so the two buffers do not share an array.
     */
    public final void set(TermBuffer other) {
      setTextLength(other.textLength);
      System.arraycopy(other.text, 0, text, 0, textLength);

      this.field = other.field;
      this.term = other.term;
    }

    /**
     * Marks this buffer as unset.  The backing array is kept for reuse.
     */
    public void reset() {
      this.field = null;
      this.textLength = 0;
      this.term = null;
    }

    /**
     * Returns the buffered term as a {@link Term}, or <code>null</code> when
     * the buffer is unset.  The Term is created on first use and cached until
     * the buffer contents change.
     */
    public Term toTerm() {
      if (field == null)                            // unset
        return null;

      if (term == null) {
        // NOTE(review): the 'false' flag presumably tells Term not to intern
        // the (already interned) field name again -- confirm against Term.
        term = new Term(field, new String(text, 0, textLength), false);
      }

      return term;
    }

    /**
     * Returns a copy of this buffer that has its own text array.  The cached
     * Term, if any, is shared with the copy.
     */
    protected Object clone() {
      TermBuffer clone;
      try {
        clone = (TermBuffer)super.clone();
      } catch (CloneNotSupportedException e) {
        // Cannot happen: this class implements Cloneable.  Fail loudly with
        // the cause instead of hitting a NullPointerException further down.
        throw new RuntimeException(e);
      }

      clone.text = new char[text.length];
      System.arraycopy(text, 0, clone.text, 0, textLength);

      return clone;
    }
  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org