You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/08/10 20:34:35 UTC
svn commit: r564715 - in /lucene/java/trunk: ./
contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/
src/java/org/apache/lucene/analysis/
src/java/org/apache/lucene/analysis/standard/
src/java/org/apache/lucene/index/ src/test/org/apach...
Author: mikemccand
Date: Fri Aug 10 11:34:33 2007
New Revision: 564715
URL: http://svn.apache.org/viewvc?view=rev&rev=564715
Log:
LUCENE-969: deprecate Token.termText() & optimize core tokenizers by re-using tokens & TokenStreams
Added:
lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestToken.java (with props)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Aug 10 11:34:33 2007
@@ -22,6 +22,12 @@
Field instance during indexing. This is a sizable performance
gain, especially for small documents. (Mike McCandless)
+ 4. LUCENE-969: Add new APIs to Token, TokenStream and Analyzer to
+ permit re-using of Token and TokenStream instances during
+ indexing. Changed Token to use a char[] as the store for the
+ termText instead of String. This gives faster tokenization
+ performance (~10-15%). (Mike McCandless)
+
Bug fixes
1. LUCENE-933: QueryParser fixed to not produce empty sub
@@ -106,6 +112,10 @@
StandardTokenizer (StandardAnalyzer) by using JFlex instead of
JavaCC to generate the tokenizer.
(Stanislaw Osinski via Mike McCandless)
+
+ 8. LUCENE-969: Changed core tokenizers & filters to re-use Token and
+ TokenStream instances when possible to improve tokenization
+ performance (~10-15%). (Mike McCandless)
Documentation
Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java Fri Aug 10 11:34:33 2007
@@ -73,7 +73,7 @@
super.tearDown();
}
- Token token = new Token("", 0, 0);
+ Token token = new Token();
public int doLogic() throws Exception {
List fields = doc.getFields();
@@ -104,13 +104,13 @@
}
// Tokenize field
- stream = analyzer.tokenStream(field.name(), reader);
+ stream = analyzer.reusableTokenStream(field.name(), reader);
}
// reset the TokenStream to the first token
stream.reset();
- while(stream.next() != null)
+ while(stream.next(token) != null)
tokenCount++;
}
totalTokenCount += tokenCount;
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Analyzer.java Fri Aug 10 11:34:33 2007
@@ -18,6 +18,7 @@
*/
import java.io.Reader;
+import java.io.IOException;
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
@@ -37,6 +38,33 @@
field name for backward compatibility. */
public abstract TokenStream tokenStream(String fieldName, Reader reader);
+ /** Creates a TokenStream that is allowed to be re-used
+ * from the previous time that the same thread called
+ * this method. Callers that do not need to use more
+ * than one TokenStream at the same time from this
+ * analyzer should use this method for better
+ * performance.
+ */
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ return tokenStream(fieldName, reader);
+ }
+
+ private ThreadLocal tokenStreams = new ThreadLocal();
+
+ /** Used by Analyzers that implement reusableTokenStream
+ * to retrieve previously saved TokenStreams for re-use
+ * by the same thread. */
+ protected Object getPreviousTokenStream() {
+ return tokenStreams.get();
+ }
+
+ /** Used by Analyzers that implement reusableTokenStream
+ * to save a TokenStream for later re-use by the same
+ * thread. */
+ protected void setPreviousTokenStream(Object obj) {
+ tokenStreams.set(obj);
+ }
+
/**
* Invoked before indexing a Fieldable instance if
@@ -56,4 +84,3 @@
return 0;
}
}
-
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java?view=auto&rev=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java Fri Aug 10 11:34:33 2007
@@ -0,0 +1,149 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * A simple class that can store & retrieve char[]'s in a
+ * hash table. Note that this is not a general purpose
+ * class. For example, it cannot remove char[]'s from the
+ * set, nor does it resize its hash table to be smaller,
+ * etc. It is designed for use with StopFilter to enable
+ * quick filtering based on the char[] termBuffer in a
+ * Token.
+ */
+
+final class CharArraySet {
+
+ private final static int INIT_SIZE = 8;
+ private final static double MAX_LOAD_FACTOR = 0.75;
+ private int mask;
+ private char[][] entries;
+ private int count;
+ private boolean ignoreCase;
+
+ /** Create set with enough capacity to hold startSize
+ * terms */
+ public CharArraySet(int startSize, boolean ignoreCase) {
+ this.ignoreCase = ignoreCase;
+ int size = INIT_SIZE;
+ while(((double) startSize)/size >= MAX_LOAD_FACTOR)
+ size *= 2;
+ mask = size-1;
+ entries = new char[size][];
+ }
+
+ /** Returns true if the characters in text up to length
+ * len is present in the set. */
+ public boolean contains(char[] text, int len) {
+ int code = getHashCode(text, len);
+ int pos = code & mask;
+ char[] text2 = entries[pos];
+ if (text2 != null && !equals(text, len, text2)) {
+ final int inc = code*1347|1;
+ do {
+ code += inc;
+ pos = code & mask;
+ text2 = entries[pos];
+ } while (text2 != null && !equals(text, len, text2));
+ }
+ return text2 != null;
+ }
+
+ /** Add this String into the set */
+ public void add(String text) {
+ add(text.toCharArray());
+ }
+
+ /** Add this text into the set */
+ public void add(char[] text) {
+ if (ignoreCase)
+ for(int i=0;i<text.length;i++)
+ text[i] = Character.toLowerCase(text[i]);
+ int code = getHashCode(text, text.length);
+ int pos = code & mask;
+ char[] text2 = entries[pos];
+ if (text2 != null) {
+ final int inc = code*1347|1;
+ do {
+ code += inc;
+ pos = code & mask;
+ text2 = entries[pos];
+ } while (text2 != null);
+ }
+ entries[pos] = text;
+ count++;
+
+ if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
+ rehash();
+ }
+ }
+
+ private boolean equals(char[] text1, int len, char[] text2) {
+ if (len != text2.length)
+ return false;
+ for(int i=0;i<len;i++) {
+ if (ignoreCase) {
+ if (Character.toLowerCase(text1[i]) != text2[i])
+ return false;
+ } else {
+ if (text1[i] != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void rehash() {
+ final int newSize = 2*count;
+ mask = newSize-1;
+
+ char[][] newEntries = new char[newSize][];
+ for(int i=0;i<entries.length;i++) {
+ char[] text = entries[i];
+ if (text != null) {
+ int code = getHashCode(text, text.length);
+ int pos = code & mask;
+ if (newEntries[pos] != null) {
+ final int inc = code*1347|1;
+ do {
+ code += inc;
+ pos = code & mask;
+ } while (newEntries[pos] != null);
+ }
+ newEntries[pos] = text;
+ }
+ }
+
+ entries = newEntries;
+ }
+
+ private int getHashCode(char[] text, int len) {
+ int downto = len;
+ int code = 0;
+ while (downto > 0) {
+ final char c;
+ if (ignoreCase)
+ c = Character.toLowerCase(text[--downto]);
+ else
+ c = text[--downto];
+ code = (code*31) + c;
+ }
+ return code;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharArraySet.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Fri Aug 10 11:34:33 2007
@@ -28,8 +28,7 @@
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
- private static final int IO_BUFFER_SIZE = 1024;
- private final char[] buffer = new char[MAX_WORD_LEN];
+ private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** Returns true iff a character should be included in a token. This
@@ -45,31 +44,32 @@
return c;
}
- /** Returns the next token in the stream, or null at EOS. */
- public final Token next() throws IOException {
+ public final Token next(Token token) throws IOException {
int length = 0;
- int start = offset;
+ int start = bufferIndex;
+ char[] buffer = token.termBuffer();
while (true) {
- final char c;
- offset++;
if (bufferIndex >= dataLen) {
+ offset += dataLen;
dataLen = input.read(ioBuffer);
+ if (dataLen == -1) {
+ if (length > 0)
+ break;
+ else
+ return null;
+ }
bufferIndex = 0;
}
- ;
- if (dataLen == -1) {
- if (length > 0)
- break;
- else
- return null;
- } else
- c = ioBuffer[bufferIndex++];
+
+ final char c = ioBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token char
if (length == 0) // start of token
- start = offset - 1;
+ start = offset + bufferIndex - 1;
+ else if (length == buffer.length)
+ buffer = token.resizeTermBuffer(1+length);
buffer[length++] = normalize(c); // buffer it, normalized
@@ -78,9 +78,18 @@
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
-
}
- return new Token(new String(buffer, 0, length), start, start + length);
+ token.termLength = length;
+ token.startOffset = start;
+ token.endOffset = start+length;
+ return token;
+ }
+
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ bufferIndex = 0;
+ offset = 0;
+ dataLen = 0;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java Fri Aug 10 11:34:33 2007
@@ -25,144 +25,166 @@
* <p>
*/
public class ISOLatin1AccentFilter extends TokenFilter {
- public ISOLatin1AccentFilter(TokenStream input) {
- super(input);
- }
+ public ISOLatin1AccentFilter(TokenStream input) {
+ super(input);
+ }
- public final Token next() throws java.io.IOException {
- final Token t = input.next();
- if (t != null)
- t.setTermText(removeAccents(t.termText()));
- return t;
- }
+ private char[] output = new char[256];
+ private int outputPos;
- /**
- * To replace accented characters in a String by unaccented equivalents.
- */
- public final static String removeAccents(String input) {
- final StringBuffer output = new StringBuffer();
- for (int i = 0; i < input.length(); i++) {
- switch (input.charAt(i)) {
- case '\u00C0' : // À
- case '\u00C1' : // Á
- case '\u00C2' : // Â
- case '\u00C3' : // Ã
- case '\u00C4' : // Ä
- case '\u00C5' : // Å
- output.append("A");
- break;
- case '\u00C6' : // Æ
- output.append("AE");
- break;
- case '\u00C7' : // Ç
- output.append("C");
- break;
- case '\u00C8' : // È
- case '\u00C9' : // É
- case '\u00CA' : // Ê
- case '\u00CB' : // Ë
- output.append("E");
- break;
- case '\u00CC' : // Ì
- case '\u00CD' : // Í
- case '\u00CE' : // Î
- case '\u00CF' : // Ï
- output.append("I");
- break;
- case '\u00D0' : // Ð
- output.append("D");
- break;
- case '\u00D1' : // Ñ
- output.append("N");
- break;
- case '\u00D2' : // Ò
- case '\u00D3' : // Ó
- case '\u00D4' : // Ô
- case '\u00D5' : // Õ
- case '\u00D6' : // Ö
- case '\u00D8' : // Ø
- output.append("O");
- break;
- case '\u0152' : // Œ
- output.append("OE");
- break;
- case '\u00DE' : // Þ
- output.append("TH");
- break;
- case '\u00D9' : // Ù
- case '\u00DA' : // Ú
- case '\u00DB' : // Û
- case '\u00DC' : // Ü
- output.append("U");
- break;
- case '\u00DD' : // Ý
- case '\u0178' : // Ÿ
- output.append("Y");
- break;
- case '\u00E0' : // à
- case '\u00E1' : // á
- case '\u00E2' : // â
- case '\u00E3' : // ã
- case '\u00E4' : // ä
- case '\u00E5' : // å
- output.append("a");
- break;
- case '\u00E6' : // æ
- output.append("ae");
- break;
- case '\u00E7' : // ç
- output.append("c");
- break;
- case '\u00E8' : // è
- case '\u00E9' : // é
- case '\u00EA' : // ê
- case '\u00EB' : // ë
- output.append("e");
- break;
- case '\u00EC' : // ì
- case '\u00ED' : // í
- case '\u00EE' : // î
- case '\u00EF' : // ï
- output.append("i");
- break;
- case '\u00F0' : // ð
- output.append("d");
- break;
- case '\u00F1' : // ñ
- output.append("n");
- break;
- case '\u00F2' : // ò
- case '\u00F3' : // ó
- case '\u00F4' : // ô
- case '\u00F5' : // õ
- case '\u00F6' : // ö
- case '\u00F8' : // ø
- output.append("o");
- break;
- case '\u0153' : // œ
- output.append("oe");
- break;
- case '\u00DF' : // ß
- output.append("ss");
- break;
- case '\u00FE' : // þ
- output.append("th");
- break;
- case '\u00F9' : // ù
- case '\u00FA' : // ú
- case '\u00FB' : // û
- case '\u00FC' : // ü
- output.append("u");
- break;
- case '\u00FD' : // ý
- case '\u00FF' : // ÿ
- output.append("y");
- break;
- default :
- output.append(input.charAt(i));
- break;
- }
- }
- return output.toString();
- }
-}
\ No newline at end of file
+ public final Token next(Token result) throws java.io.IOException {
+ result = input.next(result);
+ if (result != null) {
+ outputPos = 0;
+ removeAccents(result.termBuffer(), result.termLength());
+ result.setTermBuffer(output, 0, outputPos);
+ return result;
+ } else
+ return null;
+ }
+
+ private final void addChar(char c) {
+ if (outputPos == output.length) {
+ char[] newArray = new char[2*output.length];
+ System.arraycopy(output, 0, newArray, 0, output.length);
+ output = newArray;
+ }
+ output[outputPos++] = c;
+ }
+
+ /**
+ * To replace accented characters in a String by unaccented equivalents.
+ */
+ public final void removeAccents(char[] input, int length) {
+ int pos = 0;
+ for (int i=0; i<length; i++, pos++) {
+ switch (input[pos]) {
+ case '\u00C0' : // À
+ case '\u00C1' : // Á
+ case '\u00C2' : // Â
+ case '\u00C3' : // Ã
+ case '\u00C4' : // Ä
+ case '\u00C5' : // Å
+ addChar('A');
+ break;
+ case '\u00C6' : // Æ
+ addChar('A');
+ addChar('E');
+ break;
+ case '\u00C7' : // Ç
+ addChar('C');
+ break;
+ case '\u00C8' : // È
+ case '\u00C9' : // É
+ case '\u00CA' : // Ê
+ case '\u00CB' : // Ë
+ addChar('E');
+ break;
+ case '\u00CC' : // Ì
+ case '\u00CD' : // Í
+ case '\u00CE' : // Î
+ case '\u00CF' : // Ï
+ addChar('I');
+ break;
+ case '\u00D0' : // Ð
+ addChar('D');
+ break;
+ case '\u00D1' : // Ñ
+ addChar('N');
+ break;
+ case '\u00D2' : // Ò
+ case '\u00D3' : // Ó
+ case '\u00D4' : // Ô
+ case '\u00D5' : // Õ
+ case '\u00D6' : // Ö
+ case '\u00D8' : // Ø
+ addChar('O');
+ break;
+ case '\u0152' : // Œ
+ addChar('O');
+ addChar('E');
+ break;
+ case '\u00DE' : // Þ
+ addChar('T');
+ addChar('H');
+ break;
+ case '\u00D9' : // Ù
+ case '\u00DA' : // Ú
+ case '\u00DB' : // Û
+ case '\u00DC' : // Ü
+ addChar('U');
+ break;
+ case '\u00DD' : // Ý
+ case '\u0178' : // Ÿ
+ addChar('Y');
+ break;
+ case '\u00E0' : // à
+ case '\u00E1' : // á
+ case '\u00E2' : // â
+ case '\u00E3' : // ã
+ case '\u00E4' : // ä
+ case '\u00E5' : // å
+ addChar('a');
+ break;
+ case '\u00E6' : // æ
+ addChar('a');
+ addChar('e');
+ break;
+ case '\u00E7' : // ç
+ addChar('c');
+ break;
+ case '\u00E8' : // è
+ case '\u00E9' : // é
+ case '\u00EA' : // ê
+ case '\u00EB' : // ë
+ addChar('e');
+ break;
+ case '\u00EC' : // ì
+ case '\u00ED' : // í
+ case '\u00EE' : // î
+ case '\u00EF' : // ï
+ addChar('i');
+ break;
+ case '\u00F0' : // ð
+ addChar('d');
+ break;
+ case '\u00F1' : // ñ
+ addChar('n');
+ break;
+ case '\u00F2' : // ò
+ case '\u00F3' : // ó
+ case '\u00F4' : // ô
+ case '\u00F5' : // õ
+ case '\u00F6' : // ö
+ case '\u00F8' : // ø
+ addChar('o');
+ break;
+ case '\u0153' : // œ
+ addChar('o');
+ addChar('e');
+ break;
+ case '\u00DF' : // ß
+ addChar('s');
+ addChar('s');
+ break;
+ case '\u00FE' : // þ
+ addChar('t');
+ addChar('h');
+ break;
+ case '\u00F9' : // ù
+ case '\u00FA' : // ú
+ case '\u00FB' : // û
+ case '\u00FC' : // ü
+ addChar('u');
+ break;
+ case '\u00FD' : // ý
+ case '\u00FF' : // ÿ
+ addChar('y');
+ break;
+ default :
+ addChar(input[pos]);
+ break;
+ }
+ }
+ }
+}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordAnalyzer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordAnalyzer.java Fri Aug 10 11:34:33 2007
@@ -28,4 +28,13 @@
final Reader reader) {
return new KeywordTokenizer(reader);
}
-}
\ No newline at end of file
+ public TokenStream reusableTokenStream(String fieldName,
+ final Reader reader) {
+ Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
+ if (tokenizer == null) {
+ tokenizer = new KeywordTokenizer(reader);
+ setPreviousTokenStream(tokenizer);
+ }
+ return tokenizer;
+ }
+}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Fri Aug 10 11:34:33 2007
@@ -28,7 +28,6 @@
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
- private final char[] buffer;
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
@@ -36,23 +35,23 @@
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
- this.buffer = new char[bufferSize];
this.done = false;
}
- public Token next() throws IOException {
+ public Token next(Token result) throws IOException {
if (!done) {
done = true;
- StringBuffer buffer = new StringBuffer();
- int length;
+ int upto = 0;
+ char[] buffer = result.termBuffer();
while (true) {
- length = input.read(this.buffer);
+ final int length = input.read(buffer, upto, buffer.length-upto);
if (length == -1) break;
-
- buffer.append(this.buffer, 0, length);
+ upto += length;
+ if (upto == buffer.length)
+ buffer = result.resizeTermBuffer(1+buffer.length);
}
- String text = buffer.toString();
- return new Token(text, 0, text.length());
+ result.termLength = upto;
+ return result;
}
return null;
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java Fri Aug 10 11:34:33 2007
@@ -44,12 +44,12 @@
/**
* Returns the next input Token whose termText() is the right len
*/
- public final Token next() throws IOException
+ public final Token next(Token result) throws IOException
{
// return the first non-stop word found
- for (Token token = input.next(); token != null; token = input.next())
+ for (Token token = input.next(result); token != null; token = input.next(result))
{
- int len = token.termText().length();
+ int len = token.termLength();
if (len >= min && len <= max) {
return token;
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java Fri Aug 10 11:34:33 2007
@@ -29,14 +29,17 @@
super(in);
}
- public final Token next() throws IOException {
- Token t = input.next();
+ public final Token next(Token result) throws IOException {
+ result = input.next(result);
+ if (result != null) {
- if (t == null)
- return null;
-
- t.termText = t.termText.toLowerCase();
+ final char[] buffer = result.termBuffer();
+ final int length = result.termLength;
+ for(int i=0;i<length;i++)
+ buffer[i] = Character.toLowerCase(buffer[i]);
- return t;
+ return result;
+ } else
+ return null;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java Fri Aug 10 11:34:33 2007
@@ -18,6 +18,7 @@
*/
import java.io.Reader;
+import java.io.IOException;
import java.util.Map;
import java.util.HashMap;
@@ -73,6 +74,14 @@
}
return analyzer.tokenStream(fieldName, reader);
+ }
+
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
+ if (analyzer == null)
+ analyzer = defaultAnalyzer;
+
+ return analyzer.reusableTokenStream(fieldName, reader);
}
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java Fri Aug 10 11:34:33 2007
@@ -45,16 +45,13 @@
stemmer = new PorterStemmer();
}
- /** Returns the next input Token, after being stemmed */
- public final Token next() throws IOException {
- Token token = input.next();
- if (token == null)
+ public final Token next(Token result) throws IOException {
+ result = input.next(result);
+ if (result != null) {
+ if (stemmer.stem(result.termBuffer(), 0, result.termLength))
+ result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
+ return result;
+ } else
return null;
- else {
- String s = stemmer.stem(token.termText);
- if (s != token.termText) // Yes, I mean object reference comparison here
- token.termText = s;
- return token;
- }
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java Fri Aug 10 11:34:33 2007
@@ -18,11 +18,22 @@
*/
import java.io.Reader;
+import java.io.IOException;
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
public final class SimpleAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new LowerCaseTokenizer(reader);
+ }
+
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
+ if (tokenizer == null) {
+ tokenizer = new LowerCaseTokenizer(reader);
+ setPreviousTokenStream(tokenizer);
+ } else
+ tokenizer.reset(reader);
+ return tokenizer;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopAnalyzer.java Fri Aug 10 11:34:33 2007
@@ -71,5 +71,22 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
}
+
+ /** Filters LowerCaseTokenizer with StopFilter. */
+ private class SavedStreams {
+ Tokenizer source;
+ TokenStream result;
+ };
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ streams.source = new LowerCaseTokenizer(reader);
+ streams.result = new StopFilter(streams.source, stopWords);
+ setPreviousTokenStream(streams);
+ } else
+ streams.source.reset(reader);
+ return streams.result;
+ }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java Fri Aug 10 11:34:33 2007
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Set;
/**
@@ -27,16 +28,16 @@
public final class StopFilter extends TokenFilter {
- private final Set stopWords;
+ private final CharArraySet stopWords;
private final boolean ignoreCase;
- /**
- * Construct a token stream filtering the given input.
- */
- public StopFilter(TokenStream input, String [] stopWords)
- {
- this(input, stopWords, false);
- }
+ /**
+ * Construct a token stream filtering the given input.
+ */
+ public StopFilter(TokenStream input, String [] stopWords)
+ {
+ this(input, stopWords, false);
+ }
/**
* Constructs a filter which removes words from the input
@@ -45,22 +46,25 @@
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);
this.ignoreCase = ignoreCase;
- this.stopWords = makeStopSet(stopWords, ignoreCase);
+ this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
}
- /**
- * Construct a token stream filtering the given input.
- * @param input
- * @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
- * @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
- */
- public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
- {
- super(input);
- this.ignoreCase = ignoreCase;
- this.stopWords = stopWords;
- }
+ /**
+ * Construct a token stream filtering the given input.
+ * @param input
+ * @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
+ * @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
+ */
+ public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
+ {
+ super(input);
+ this.ignoreCase = ignoreCase;
+ this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
+ Iterator it = stopWords.iterator();
+ while(it.hasNext())
+ this.stopWords.add((String) it.next());
+ }
/**
* Constructs a filter which removes words from the input
@@ -97,18 +101,23 @@
for (int i = 0; i < stopWords.length; i++)
stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
return stopTable;
- }
+ }
+
+ private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
+ CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
+ for (int i = 0; i < stopWords.length; i++)
+ stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
+ return stopSet;
+ }
/**
* Returns the next input Token whose termText() is not a stop word.
*/
- public final Token next() throws IOException {
+ public final Token next(Token result) throws IOException {
// return the first non-stop word found
- for (Token token = input.next(); token != null; token = input.next())
- {
- String termText = ignoreCase ? token.termText.toLowerCase() : token.termText;
- if (!stopWords.contains(termText))
- return token;
+ while((result = input.next(result)) != null) {
+ if (!stopWords.contains(result.termBuffer(), result.termLength))
+ return result;
}
// reached EOS -- return null
return null;
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java Fri Aug 10 11:34:33 2007
@@ -1,8 +1,5 @@
package org.apache.lucene.analysis;
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.index.TermPositions;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -20,6 +17,9 @@
* limitations under the License.
*/
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.TermPositions;
+
/** A Token is an occurence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -44,66 +44,110 @@
The APIs introduced here might change in the future and will not be
supported anymore in such a case.</font>
+ <br><br>
+
+ <p><b>NOTE:</b> As of 2.3, Token stores the term text
+ internally as a malleable char[] termBuffer instead of
+ String termText. The indexing code and core tokenizers
+ have been changed to re-use a single Token instance, changing
+ its buffer and other fields in-place as the Token is
+ processed. This provides substantially better indexing
+ performance as it saves the GC cost of new'ing a Token and
+ String for every term. The APIs that accept String
+ termText are still available but a warning about the
+ associated performance cost has been added (below). The
+ {@link #termText()} method has been deprecated.</p>
+
+ <p>Tokenizers and filters should try to re-use a Token
+ instance when possible for best performance, by
+ implementing the {@link TokenStream#next(Token)} API.
+ Failing that, to create a new Token you should first use
+ one of the constructors that starts with null text. Then
+ you should call either {@link #termBuffer()} or {@link
+ #resizeTermBuffer(int)} to retrieve the Token's
+ termBuffer. Fill in the characters of your term into this
+ buffer, and finally call {@link #setTermLength(int)} to
+ set the length of the term text. See <a target="_top"
+ href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
+ for details.</p>
+
@see org.apache.lucene.index.Payload
- */
- // TODO: Remove warning after API has been finalized
+*/
+
+// TODO: Remove warning after API has been finalized
+
public class Token implements Cloneable {
- String termText; // the text of the term
+
+ private static final String DEFAULT_TYPE = "word";
+ private static int MIN_BUFFER_SIZE = 10;
+
+ /** @deprecated: we will remove this when we remove the
+ * deprecated APIs */
+ private String termText;
+
+ char[] termBuffer; // characters for the term text
+ int termLength; // length of term text in buffer
+
int startOffset; // start in source text
int endOffset; // end in source text
- String type = "word"; // lexical type
+ String type = DEFAULT_TYPE; // lexical type
Payload payload;
- // For better indexing speed, use termBuffer (and
- // termBufferOffset/termBufferLength) instead of termText
- // to save new'ing a String per token
- char[] termBuffer;
- int termBufferOffset;
- int termBufferLength;
+ int positionIncrement = 1;
- private int positionIncrement = 1;
+ /** Constructs a Token with null text. */
+ public Token() {
+ }
- /** Constructs a Token with the given term text, and start & end offsets.
- The type defaults to "word." */
- public Token(String text, int start, int end) {
- termText = text;
+ /** Constructs a Token with null text and start & end
+ * offsets.
+ * @param start start offset
+ * @param end end offset */
+ public Token(int start, int end) {
startOffset = start;
endOffset = end;
}
- /** Constructs a Token with the given term text buffer
- * starting at offset for length lenth, and start & end offsets.
- * The type defaults to "word." */
- public Token(char[] text, int offset, int length, int start, int end) {
- termBuffer = text;
- termBufferOffset = offset;
- termBufferLength = length;
+ /** Constructs a Token with null text and start & end
+ * offsets plus the Token type.
+ * @param start start offset
+ * @param end end offset */
+ public Token(int start, int end, String typ) {
startOffset = start;
endOffset = end;
+ type = typ;
}
- /** Constructs a Token with the given text, start and end offsets, & type. */
- public Token(String text, int start, int end, String typ) {
+ /** Constructs a Token with the given term text, and start
+ * & end offsets. The type defaults to "word."
+ * <b>NOTE:</b> for better indexing speed you should
+ * instead use the char[] termBuffer methods to set the
+ * term text.
+ * @param text term text
+ * @param start start offset
+ * @param end end offset */
+ public Token(String text, int start, int end) {
termText = text;
startOffset = start;
endOffset = end;
- type = typ;
}
- /** Constructs a Token with the given term text buffer
- * starting at offset for length lenth, and start & end
- * offsets, & type. */
- public Token(char[] text, int offset, int length, int start, int end, String typ) {
- termBuffer = text;
- termBufferOffset = offset;
- termBufferLength = length;
+ /** Constructs a Token with the given text, start and end
+ * offsets, & type. <b>NOTE:</b> for better indexing
+ * speed you should instead use the char[] termBuffer
+ * methods to set the term text.
+ * @param text term text
+ * @param start start offset
+ * @param end end offset
+ * @param typ token type */
+ public Token(String text, int start, int end, String typ) {
+ termText = text;
startOffset = start;
endOffset = end;
type = typ;
}
-
/** Set the position increment. This determines the position of this token
* relative to the previous Token in a {@link TokenStream}, used in phrase
* searching.
@@ -139,28 +183,103 @@
/** Returns the position increment of this Token.
* @see #setPositionIncrement
*/
- public int getPositionIncrement() { return positionIncrement; }
+ public int getPositionIncrement() {
+ return positionIncrement;
+ }
- /** Sets the Token's term text. */
+ /** Sets the Token's term text. <b>NOTE:</b> for better
+ * indexing speed you should instead use the char[]
+ * termBuffer methods to set the term text. */
public void setTermText(String text) {
termText = text;
+ termBuffer = null;
}
- /** Returns the Token's term text. */
- public final String termText() { return termText; }
- public final char[] termBuffer() { return termBuffer; }
- public final int termBufferOffset() { return termBufferOffset; }
- public final int termBufferLength() { return termBufferLength; }
-
- public void setStartOffset(int offset) {this.startOffset = offset;}
- public void setEndOffset(int offset) {this.endOffset = offset;}
+ /** Returns the Token's term text.
+ *
+ * @deprecated Use {@link #termBuffer()} and {@link
+ * #termLength()} instead. */
+ public final String termText() {
+ if (termText == null && termBuffer != null)
+ termText = new String(termBuffer, 0, termLength);
+ return termText;
+ }
+ /** Copies the contents of buffer, starting at offset for
+ * length characters, into the termBuffer
+ * array. <b>NOTE:</b> for better indexing speed you
+ * should instead retrieve the termBuffer, using {@link
+ * #termBuffer()} or {@link #resizeTermBuffer(int)}, and
+ * fill it in directly to set the term text. This saves
+ * an extra copy. */
public final void setTermBuffer(char[] buffer, int offset, int length) {
- this.termBuffer = buffer;
- this.termBufferOffset = offset;
- this.termBufferLength = length;
+ resizeTermBuffer(length);
+ System.arraycopy(buffer, offset, termBuffer, 0, length);
+ termLength = length;
+ }
+
+ /** Returns the internal termBuffer character array which
+ * you can then directly alter. If the array is too
+ * small for your token, use {@link
+ * #resizeTermBuffer(int)} to increase it. After
+ * altering the buffer be sure to call {@link
+ * #setTermLength} to record the number of valid
+ * characters that were placed into the termBuffer. */
+ public final char[] termBuffer() {
+ initTermBuffer();
+ return termBuffer;
+ }
+
+ /** Grows the termBuffer to at least size newSize.
+ * @param newSize minimum size of the new termBuffer
+ * @return newly created termBuffer with length >= newSize
+ */
+ public char[] resizeTermBuffer(int newSize) {
+ initTermBuffer();
+ if (newSize > termBuffer.length) {
+ int size = termBuffer.length;
+ while(size < newSize)
+ size *= 2;
+ char[] newBuffer = new char[size];
+ System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
+ termBuffer = newBuffer;
+ }
+ return termBuffer;
+ }
+
+ // TODO: once we remove the deprecated termText() method
+ // and switch entirely to char[] termBuffer we don't need
+ // to use this method anymore
+ private void initTermBuffer() {
+ if (termBuffer == null) {
+ if (termText == null) {
+ termBuffer = new char[MIN_BUFFER_SIZE];
+ termLength = 0;
+ } else {
+ int length = termText.length();
+ if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE;
+ termBuffer = new char[length];
+ termLength = termText.length();
+ termText.getChars(0, termText.length(), termBuffer, 0);
+ termText = null;
+ }
+ } else if (termText != null)
+ termText = null;
+ }
+
+ /** Return number of valid characters (length of the term)
+ * in the termBuffer array. */
+ public final int termLength() {
+ initTermBuffer();
+ return termLength;
+ }
+
+ /** Set number of valid characters (length of the term) in
+ * the termBuffer array. */
+ public final void setTermLength(int length) {
+ initTermBuffer();
+ termLength = length;
}
-
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
@@ -168,42 +287,72 @@
Note that the difference between endOffset() and startOffset() may not be
equal to termText.length(), as the term text may have been altered by a
stemmer or some other filter. */
- public final int startOffset() { return startOffset; }
+ public final int startOffset() {
+ return startOffset;
+ }
+
+ /** Set the starting offset.
+ @see #startOffset() */
+ public void setStartOffset(int offset) {
+ this.startOffset = offset;
+ }
/** Returns this Token's ending offset, one greater than the position of the
last character corresponding to this token in the source text. */
- public final int endOffset() { return endOffset; }
+ public final int endOffset() {
+ return endOffset;
+ }
+
+ /** Set the ending offset.
+ @see #endOffset() */
+ public void setEndOffset(int offset) {
+ this.endOffset = offset;
+ }
/** Returns this Token's lexical type. Defaults to "word". */
- public final String type() { return type; }
+ public final String type() {
+ return type;
+ }
+
+ /** Set the lexical type.
+ @see #type() */
+ public final void setType(String type) {
+ this.type = type;
+ }
/**
- * Sets this Token's payload.
+ * Returns this Token's payload.
* <p><font color="#FF0000">
* WARNING: The status of the <b>Payloads</b> feature is experimental.
* The APIs introduced here might change in the future and will not be
* supported anymore in such a case.</font>
*/
// TODO: Remove warning after API has been finalized
- public void setPayload(Payload payload) {
- this.payload = payload;
+ public Payload getPayload() {
+ return this.payload;
}
-
+
/**
- * Returns this Token's payload.
+ * Sets this Token's payload.
* <p><font color="#FF0000">
* WARNING: The status of the <b>Payloads</b> feature is experimental.
* The APIs introduced here might change in the future and will not be
* supported anymore in such a case.</font>
*/
// TODO: Remove warning after API has been finalized
- public Payload getPayload() {
- return this.payload;
+ public void setPayload(Payload payload) {
+ this.payload = payload;
}
-
+
public String toString() {
StringBuffer sb = new StringBuffer();
- sb.append("(" + termText + "," + startOffset + "," + endOffset);
+ sb.append("(");
+ initTermBuffer();
+ if (termBuffer == null)
+ sb.append("null");
+ else
+ sb.append(termBuffer, 0, termLength);
+ sb.append("," + startOffset + "," + endOffset);
if (!type.equals("word"))
sb.append(",type="+type);
if (positionIncrement != 1)
@@ -212,11 +361,14 @@
return sb.toString();
}
- public Object clone() {
- try {
- return super.clone();
- } catch (CloneNotSupportedException e) {
- throw new RuntimeException(e); // shouldn't happen since we implement Cloneable
- }
+ /** Reset all state for this token back to defaults. */
+ public void clear() {
+ payload = null;
+ // Leave termBuffer to allow re-use
+ termLength = 0;
+ termText = null;
+ positionIncrement = 1;
+ startOffset = endOffset = 0;
+ type = DEFAULT_TYPE;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java Fri Aug 10 11:34:33 2007
@@ -22,6 +22,8 @@
/** A TokenFilter is a TokenStream whose input is another token stream.
<p>
This is an abstract class.
+ NOTE: subclasses must override at least one of {@link
+ #next()} or {@link #next(Token)}.
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java Fri Aug 10 11:34:33 2007
@@ -29,11 +29,36 @@
<li>{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
</ul>
+ NOTE: subclasses must override at least one of {@link
+ #next()} or {@link #next(Token)}.
*/
public abstract class TokenStream {
- /** Returns the next token in the stream, or null at EOS. */
- public abstract Token next() throws IOException;
+
+ /** Returns the next token in the stream, or null at EOS.
+ * The returned Token is a "full private copy" (not
+ * re-used across calls to next()) but will be slower
+ * than calling {@link #next(Token)} instead. */
+ public Token next() throws IOException {
+ Token result = next(new Token());
+ return result;
+ }
+
+ /** Returns the next token in the stream, or null at EOS.
+ * When possible, the input Token should be used as the
+ * returned Token (this gives fastest tokenization
+ * performance), but this is not required and a new Token
+ * may be returned. Callers may re-use a single Token
+ * instance for successive calls to this method and must
+ * therefore fully consume the previously returned Token
+ * before calling this method again.
+ * @param result a Token that may or may not be used to
+ * return
+ * @return next token in the stream or null if
+ * end-of-stream was hit */
+ public Token next(Token result) throws IOException {
+ return next();
+ }
/** Resets this stream to the beginning. This is an
* optional operation, so subclasses may or may not
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java Fri Aug 10 11:34:33 2007
@@ -23,6 +23,8 @@
/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class.
+ NOTE: subclasses must override at least one of {@link
+ #next()} or {@link #next(Token)}.
*/
public abstract class Tokenizer extends TokenStream {
@@ -40,6 +42,13 @@
/** By default, closes the input Reader. */
public void close() throws IOException {
input.close();
+ }
+
+ /** Reset the tokenizer to a new reader. Typically, an
+ * analyzer (in its reusableTokenStream method) will use
+ * this to re-use a previously created tokenizer. */
+ protected void reset(Reader input) throws IOException {
+ this.input = input;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java Fri Aug 10 11:34:33 2007
@@ -18,11 +18,22 @@
*/
import java.io.Reader;
+import java.io.IOException;
/** An Analyzer that uses WhitespaceTokenizer. */
public final class WhitespaceAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
+ }
+
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
+ if (tokenizer == null) {
+ tokenizer = new WhitespaceTokenizer(reader);
+ setPreviousTokenStream(tokenizer);
+ } else
+ tokenizer.reset(reader);
+ return tokenizer;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Fri Aug 10 11:34:33 2007
@@ -75,4 +75,23 @@
result = new StopFilter(result, stopSet);
return result;
}
+
+ private class SavedStreams {
+ StandardTokenizer tokenStream;
+ TokenStream filteredTokenStream;
+ };
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+ if (streams == null) {
+ streams = new SavedStreams();
+ setPreviousTokenStream(streams);
+ streams.tokenStream = new StandardTokenizer(reader);
+ streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
+ streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
+ streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
+ } else
+ streams.tokenStream.reset(reader);
+
+ return streams.filteredTokenStream;
+ }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java Fri Aug 10 11:34:33 2007
@@ -18,6 +18,7 @@
*/
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
@@ -37,33 +38,32 @@
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
- public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
- org.apache.lucene.analysis.Token t = input.next();
+ public final Token next(Token result) throws java.io.IOException {
+ Token t = input.next(result);
if (t == null)
return null;
- String text = t.termText();
- String type = t.type();
+ char[] buffer = t.termBuffer();
+ final int bufferLength = t.termLength();
+ final String type = t.type();
if (type == APOSTROPHE_TYPE && // remove 's
- (text.endsWith("'s") || text.endsWith("'S"))) {
- return new org.apache.lucene.analysis.Token
- (text.substring(0,text.length()-2),
- t.startOffset(), t.endOffset(), type);
-
+ bufferLength >= 2 &&
+ buffer[bufferLength-2] == '\'' &&
+ (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
+ // Strip last 2 characters off
+ t.setTermLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) { // remove dots
- StringBuffer trimmed = new StringBuffer();
- for (int i = 0; i < text.length(); i++) {
- char c = text.charAt(i);
- if (c != '.')
- trimmed.append(c);
+ int upto = 0;
+ for(int i=0;i<bufferLength;i++) {
+ char c = buffer[i];
+ if (c != '.')
+ buffer[upto++] = c;
}
- return new org.apache.lucene.analysis.Token
- (trimmed.toString(), t.startOffset(), t.endOffset(), type);
-
- } else {
- return t;
+ t.setTermLength(upto);
}
+
+ return t;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Aug 10 11:34:33 2007
@@ -43,6 +43,9 @@
public class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private final StandardTokenizerImpl scanner;
+ void setInput(Reader reader) {
+ this.input = reader;
+ }
/**
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
@@ -58,19 +61,19 @@
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
- public Token next() throws IOException {
+ public Token next(Token result) throws IOException {
int tokenType = scanner.getNextToken();
if (tokenType == StandardTokenizerImpl.YYEOF) {
return null;
}
- int startPosition = scanner.yychar();
-
- final String tokenImage = scanner.yytext();
- return new Token(tokenImage, startPosition, startPosition
- + tokenImage.length(),
- StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+ scanner.getText(result);
+ final int start = scanner.yychar();
+ result.setStartOffset(start);
+ result.setEndOffset(start+result.termLength());
+ result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+ return result;
}
/*
@@ -81,5 +84,10 @@
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
+ }
+
+ public void reset(Reader reader) throws IOException {
+ input = reader;
+ reset();
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java Fri Aug 10 11:34:33 2007
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.1 on 8/8/07 10:18 PM */
+/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */
package org.apache.lucene.analysis.standard;
@@ -19,7 +19,15 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.Token;
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
+ * on 8/9/07 10:15 AM from the specification file
+ * <tt>/tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
+ */
class StandardTokenizerImpl {
/** This character denotes the end of file */
@@ -295,6 +303,13 @@
public final int yychar()
{
return yychar;
+}
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t) {
+ t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex Fri Aug 10 11:34:33 2007
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.Token;
+
%%
%class StandardTokenizerImpl
@@ -51,6 +53,13 @@
public final int yychar()
{
return yychar;
+}
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t) {
+ t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Fri Aug 10 11:34:33 2007
@@ -960,27 +960,17 @@
/** Test whether the text for current Posting p equals
* current tokenText. */
- boolean postingEquals(final String tokenString, final char[] tokenText,
- final int tokenTextLen, final int tokenTextOffset) {
+ boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
final char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
assert text != null;
int pos = p.textStart & CHAR_BLOCK_MASK;
- if (tokenText == null) {
- // Compare to String
- for(int i=0;i<tokenTextLen;i++)
- if (tokenString.charAt(i) != text[pos++])
- return false;
- return text[pos] == 0xffff;
- } else {
- int tokenPos = tokenTextOffset;
- final int stopAt = tokenTextLen+tokenPos;
- for(;tokenPos<stopAt;pos++,tokenPos++)
- if (tokenText[tokenPos] != text[pos])
- return false;
- return 0xffff == text[pos];
- }
+ int tokenPos = 0;
+ for(;tokenPos<tokenTextLen;pos++,tokenPos++)
+ if (tokenText[tokenPos] != text[pos])
+ return false;
+ return 0xffff == text[pos];
}
/** Compares term text for two Posting instance and
@@ -1241,8 +1231,7 @@
}
int offsetEnd;
- Token token;
- Token localToken = new Token("", 0, 0);
+ Token localToken = new Token();
/* Invert one occurrence of one field in the document */
public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException {
@@ -1251,12 +1240,12 @@
position += analyzer.getPositionIncrementGap(fieldInfo.name);
if (!field.isTokenized()) { // un-tokenized field
- token = localToken;
String stringValue = field.stringValue();
+ Token token = localToken;
token.setTermText(stringValue);
token.setStartOffset(offset);
token.setEndOffset(offset + stringValue.length());
- addPosition();
+ addPosition(token);
offset += stringValue.length();
length++;
} else { // tokenized field
@@ -1282,7 +1271,7 @@
}
// Tokenize field and add to postingTable
- stream = analyzer.tokenStream(fieldInfo.name, reader);
+ stream = analyzer.reusableTokenStream(fieldInfo.name, reader);
}
// reset the TokenStream to the first token
@@ -1290,9 +1279,10 @@
try {
offsetEnd = offset-1;
- for (token = stream.next(); token != null; token = stream.next()) {
+ Token token;
+ while((token = stream.next(localToken)) != null) {
position += (token.getPositionIncrement() - 1);
- addPosition();
+ addPosition(token);
if (++length >= maxFieldLength) {
if (infoStream != null)
infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
@@ -1357,55 +1347,32 @@
* for every term of every document. Its job is to *
* update the postings byte stream (Postings hash) *
* based on the occurence of a single term. */
- private void addPosition() {
+ private void addPosition(Token token) {
final Payload payload = token.getPayload();
- final String tokenString;
- final int tokenTextLen;
- final int tokenTextOffset;
-
// Get the text of this term. Term can either
// provide a String token or offset into a char[]
// array
final char[] tokenText = token.termBuffer();
+ final int tokenTextLen = token.termLength();
int code = 0;
int code2 = 0;
- if (tokenText == null) {
-
- // Fallback to String token
- tokenString = token.termText();
- tokenTextLen = tokenString.length();
- tokenTextOffset = 0;
-
- // Compute hashcode.
- int downto = tokenTextLen;
- while (downto > 0)
- code = (code*31) + tokenString.charAt(--downto);
-
- // System.out.println(" addPosition: field=" + fieldInfo.name + " string=" + tokenString + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset+token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
+ // Compute hashcode
+ int downto = tokenTextLen;
+ while (downto > 0)
+ code = (code*31) + tokenText[--downto];
- } else {
- tokenString = null;
- tokenTextLen = token.termBufferLength();
- tokenTextOffset = token.termBufferOffset();
-
- // Compute hashcode
- int downto = tokenTextLen+tokenTextOffset;
- while (downto > tokenTextOffset)
- code = (code*31) + tokenText[--downto];
-
- // System.out.println(" addPosition: buffer=" + new String(tokenText, tokenTextOffset, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
- }
+ // System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
int hashPos = code & postingsHashMask;
// Locate Posting in hash
p = postingsHash[hashPos];
- if (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset)) {
+ if (p != null && !postingEquals(tokenText, tokenTextLen)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = code*1347|1;
@@ -1413,7 +1380,7 @@
code += inc;
hashPos = code & postingsHashMask;
p = postingsHash[hashPos];
- } while (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset));
+ } while (p != null && !postingEquals(tokenText, tokenTextLen));
}
final int proxCode;
@@ -1492,10 +1459,7 @@
p.textStart = textUpto + charPool.byteOffset;
charPool.byteUpto += textLen1;
- if (tokenString == null)
- System.arraycopy(tokenText, tokenTextOffset, text, textUpto, tokenTextLen);
- else
- tokenString.getChars(0, tokenTextLen, text, textUpto);
+ System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
text[textUpto+tokenTextLen] = 0xffff;
Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java?view=diff&rev=564715&r1=564714&r2=564715
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java Fri Aug 10 11:34:33 2007
@@ -94,7 +94,7 @@
Token token;
while ((token = stream.next()) != null) {
assertTrue(count < tokens.length);
- assertEquals(tokens[count], token.termText);
+ assertEquals(tokens[count], token.termText());
count++;
}
Added: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestToken.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestToken.java?view=auto&rev=564715
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestToken.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestToken.java Fri Aug 10 11:34:33 2007
@@ -0,0 +1,56 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.*;
+import junit.framework.*;
+
+public class TestToken extends TestCase {
+
+ public TestToken(String name) {
+ super(name);
+ }
+
+ public void testToString() throws Exception {
+ char[] b = {'a', 'l', 'o', 'h', 'a'};
+ Token t = new Token("", 0, 5);
+ t.setTermBuffer(b, 0, 5);
+ assertEquals("(aloha,0,5)", t.toString());
+
+ t.setTermText("hi there");
+ assertEquals("(hi there,0,5)", t.toString());
+ }
+
+ public void testMixedStringArray() throws Exception {
+ Token t = new Token("hello", 0, 5);
+ assertEquals(t.termText(), "hello");
+ assertEquals(t.termLength(), 5);
+ assertEquals(new String(t.termBuffer(), 0, 5), "hello");
+ t.setTermText("hello2");
+ assertEquals(t.termLength(), 6);
+ assertEquals(new String(t.termBuffer(), 0, 6), "hello2");
+ t.setTermBuffer("hello3".toCharArray(), 0, 6);
+ assertEquals(t.termText(), "hello3");
+
+ // Make sure if we get the buffer and change a character
+ // that termText() reflects the change
+ char[] buffer = t.termBuffer();
+ buffer[1] = 'o';
+ assertEquals(t.termText(), "hollo3");
+ }
+}
Propchange: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestToken.java
------------------------------------------------------------------------------
svn:eol-style = native