svn commit: r1227035 - in /lucene/dev/branches/lucene3305: ./ lucene/
lucene/src/test-framework/java/org/apache/lucene/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/util/
modules/analysis/common/src/test/org/apache/lucene/analys...
Author: rmuir
Date: Wed Jan 4 03:01:23 2012
New Revision: 1227035
URL: http://svn.apache.org/viewvc?rev=1227035&view=rev
Log:
LUCENE-3305: don't read the whole doc into RAM; use SegmentingTokenizerBase + a Japanese BreakIterator. This also reduces the error rate w.r.t. MeCab a bit.
Added:
lucene/dev/branches/lucene3305/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/tanakaseg.zip (with props)
Modified:
lucene/dev/branches/lucene3305/ (props changed)
lucene/dev/branches/lucene3305/lucene/ (props changed)
lucene/dev/branches/lucene3305/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
Modified: lucene/dev/branches/lucene3305/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1227035&r1=1227034&r2=1227035&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/lucene3305/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Wed Jan 4 03:01:23 2012
@@ -249,7 +249,46 @@ public abstract class BaseTokenStreamTes
// TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers.
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
checkRandomData(random, a, iterations, 20);
+ // now test with multiple threads
+ int numThreads = _TestUtil.nextInt(random, 4, 8);
+ Thread threads[] = new Thread[numThreads];
+ for (int i = 0; i < threads.length; i++) {
+ threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations);
+ }
+ for (int i = 0; i < threads.length; i++) {
+ threads[i].start();
+ }
+ for (int i = 0; i < threads.length; i++) {
+ try {
+ threads[i].join();
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
}
+
+ static class AnalysisThread extends Thread {
+ final int iterations;
+ final Random random;
+ final Analyzer a;
+
+ AnalysisThread(Random random, Analyzer a, int iterations) {
+ this.random = random;
+ this.a = a;
+ this.iterations = iterations;
+ }
+
+ @Override
+ public void run() {
+ try {
+ // see the part in checkRandomData where it replays the same text again
+ // to verify reproducibility/reuse: hopefully this will catch thread hazards.
+ checkRandomData(random, a, iterations, 20);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
for (int i = 0; i < iterations; i++) {
Added: lucene/dev/branches/lucene3305/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java?rev=1227035&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java Wed Jan 4 03:01:23 2012
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import java.text.BreakIterator;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Breaks text into sentences with a {@link BreakIterator} and
+ * allows subclasses to decompose these sentences into words.
+ * <p>
+ * This can be used by subclasses that need sentence context
+ * for tokenization purposes, such as CJK segmenters.
+ * <p>
+ * Additionally it can be used by subclasses that want to mark
+ * sentence boundaries (with a custom attribute, extra token, position
+ * increment, etc) for downstream processing.
+ *
+ * @lucene.experimental
+ */
+public abstract class SegmentingTokenizerBase extends Tokenizer {
+ protected static final int BUFFERMAX = 4096;
+ protected final char buffer[] = new char[BUFFERMAX];
+ /** true length of text in the buffer */
+ private int length = 0;
+ /** length in buffer that can be evaluated safely, up to a safe end point */
+ private int usableLength = 0;
+ /** accumulated offset of previous buffers for this reader, for offsetAtt */
+ protected int offset = 0;
+
+ private final BreakIterator iterator;
+ private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();
+
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ /**
+ * Construct a new SegmentingTokenizerBase from the given Reader, using
+ * the provided BreakIterator for sentence segmentation.
+ * <p>
+ * Note that you should never share BreakIterators across different
+ * TokenStreams; instead, always provide a newly created or cloned one
+ * to this constructor.
+ */
+ public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
+ super(input);
+ this.iterator = iterator;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (length == 0 || !incrementWord()) {
+ while (!incrementSentence()) {
+ refill();
+ if (length <= 0) // no more chars to read
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ wrapper.setText(buffer, 0, 0);
+ iterator.setText(wrapper);
+ length = usableLength = offset = 0;
+ }
+
+ @Override
+ public void reset(Reader input) throws IOException {
+ this.input = input;
+ reset();
+ }
+
+ @Override
+ public final void end() throws IOException {
+ final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ /** Returns the last unambiguous break position in the text. */
+ private int findSafeEnd() {
+ for (int i = length - 1; i >= 0; i--)
+ if (isSafeEnd(buffer[i]))
+ return i + 1;
+ return -1;
+ }
+
+ /** For sentence tokenization, these are the unambiguous break positions. */
+ protected boolean isSafeEnd(char ch) {
+ switch(ch) {
+ case 0x000D:
+ case 0x000A:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * Refill the buffer, accumulating the offset and setting usableLength to the
+ * last unambiguous break position
+ */
+ private void refill() throws IOException {
+ offset += usableLength;
+ int leftover = length - usableLength;
+ System.arraycopy(buffer, usableLength, buffer, 0, leftover);
+ int requested = buffer.length - leftover;
+ int returned = input.read(buffer, leftover, requested);
+ length = returned < 0 ? leftover : returned + leftover;
+ if (returned < requested) /* reader has been emptied, process the rest */
+ usableLength = length;
+ else { /* still more data to be read, find a safe-stopping place */
+ usableLength = findSafeEnd();
+ if (usableLength < 0)
+ usableLength = length; /*
+ * more than BUFFERMAX chars of text without breaks,
+ * so we may possibly truncate tokens
+ */
+ }
+
+ wrapper.setText(buffer, 0, Math.max(0, usableLength));
+ iterator.setText(wrapper);
+ }
+
+ /**
+ * Returns true if a sentence with at least one word was found in the
+ * buffer, or false if it is exhausted.
+ */
+ private boolean incrementSentence() throws IOException {
+ if (length == 0) // we must refill the buffer
+ return false;
+
+ while (true) {
+ int start = iterator.current();
+
+ if (start == BreakIterator.DONE)
+ return false; // BreakIterator exhausted
+
+ // find the next set of boundaries
+ int end = iterator.next();
+
+ if (end == BreakIterator.DONE)
+ return false; // BreakIterator exhausted
+
+ setNextSentence(start, end);
+ if (incrementWord()) {
+ return true;
+ }
+ }
+ }
+
+ /** Provides the next input sentence for analysis */
+ protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);
+ /** Returns true if another word is available */
+ protected abstract boolean incrementWord();
+}
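[Editorial note] For orientation, a minimal sketch of the subclassing contract (hypothetical class, not part of this commit; the test below exercises the same pattern with more variations): the base class fills the protected buffer, hands it to the BreakIterator, and then alternates between setNextSentence() and incrementWord().

    import java.io.Reader;
    import java.text.BreakIterator;
    import java.util.Locale;

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.util.SegmentingTokenizerBase;

    /** Hypothetical example: emits each sentence as a single token. */
    public final class SentenceTokenizerSketch extends SegmentingTokenizerBase {
      private int start, end;  // current sentence bounds, relative to buffer
      private boolean pending; // still a token to emit for this sentence?

      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

      public SentenceTokenizerSketch(Reader input) {
        // per the constructor javadoc: never share BreakIterators, make one per stream
        super(input, BreakIterator.getSentenceInstance(new Locale("")));
      }

      @Override
      protected void setNextSentence(int sentenceStart, int sentenceEnd) {
        start = sentenceStart;
        end = sentenceEnd;
        pending = true; // the whole sentence becomes one token
      }

      @Override
      protected boolean incrementWord() {
        if (!pending)
          return false; // sentence exhausted: the base class advances
        pending = false;
        clearAttributes();
        termAtt.copyBuffer(buffer, start, end - start);
        // 'offset' is the accumulated length of previous buffers for this reader
        offsetAtt.setOffset(offset + start, offset + end);
        return true;
      }
    }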
Added: lucene/dev/branches/lucene3305/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java?rev=1227035&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java Wed Jan 4 03:01:23 2012
@@ -0,0 +1,224 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.text.BreakIterator;
+import java.util.Arrays;
+import java.util.Locale;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/** Basic tests for {@link SegmentingTokenizerBase} */
+public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
+ private Analyzer sentence = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WholeSentenceTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ private Analyzer sentenceAndWord = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new SentenceAndWordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ /** Some simple examples, just outputting the whole sentence boundaries as "terms" */
+ public void testBasics() throws IOException {
+ assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
+ new String[] { "The acronym for United States is U.S. but this doesn't end a sentence"}
+ );
+ assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
+ new String[] { "He said, \"Are you going?\" ",
+ "John shook his head." }
+ );
+ }
+
+ /** Test a subclass that sets some custom attribute values */
+ public void testCustomAttributes() throws IOException {
+ assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
+ new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
+ new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
+ new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
+ new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 }
+ );
+ }
+
+ /** Tests tokenstream reuse */
+ public void testReuse() throws IOException {
+ assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"",
+ new String[] { "He", "said", "Are", "you", "going" },
+ new int[] { 0, 3, 10, 14, 18 },
+ new int[] { 2, 7, 13, 17, 23 },
+ new int[] { 1, 1, 1, 1, 1 }
+ );
+ assertAnalyzesToReuse(sentenceAndWord, "John shook his head.",
+ new String[] { "John", "shook", "his", "head" },
+ new int[] { 0, 5, 11, 15 },
+ new int[] { 4, 10, 14, 19 },
+ new int[] { 1, 1, 1, 1 }
+ );
+ }
+
+ /** Tests TokenStream.end() */
+ public void testEnd() throws IOException {
+ // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
+ // we add some junk whitespace to the end just to test it.
+ assertAnalyzesTo(sentenceAndWord, "John shook his head ",
+ new String[] { "John", "shook", "his", "head" }
+ );
+ assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
+ new String[] { "John", "shook", "his", "head" }
+ );
+ }
+
+ /** Tests terms which span across boundaries */
+ public void testHugeDoc() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ char whitespace[] = new char[4094];
+ Arrays.fill(whitespace, '\n');
+ sb.append(whitespace);
+ sb.append("testing 1234");
+ String input = sb.toString();
+ assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
+ }
+
+ /** Tests a term longer than the buffer: it is split into BUFFERMAX-sized tokens */
+ public void testHugeTerm() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 40960; i++) {
+ sb.append('a');
+ }
+ String input = sb.toString();
+ char token[] = new char[4096];
+ Arrays.fill(token, 'a');
+ String expectedToken = new String(token);
+ String expected[] = {
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken
+ };
+ assertAnalyzesTo(sentence, input, expected);
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, sentence, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, sentenceAndWord, 10000*RANDOM_MULTIPLIER);
+ }
+
+ // some tokenizers for testing
+
+ /** silly tokenizer that just returns whole sentences as tokens */
+ static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
+ int sentenceStart, sentenceEnd;
+ boolean hasSentence;
+
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ public WholeSentenceTokenizer(Reader input) {
+ super(input, BreakIterator.getSentenceInstance(new Locale("")));
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ hasSentence = true;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ if (hasSentence) {
+ hasSentence = false;
+ clearAttributes();
+ termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
+ offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ /**
+ * simple tokenizer that bumps the position increment by 1 for tokens
+ * after a sentence boundary, to inhibit phrase queries without slop.
+ */
+ static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
+ int sentenceStart, sentenceEnd;
+ int wordStart, wordEnd;
+ int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost
+
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ public SentenceAndWordTokenizer(Reader input) {
+ super(input, BreakIterator.getSentenceInstance(new Locale("")));
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ posBoost++;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ posBoost = -1;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ wordStart = wordEnd;
+ while (wordStart < sentenceEnd) {
+ if (Character.isLetterOrDigit(buffer[wordStart]))
+ break;
+ wordStart++;
+ }
+
+ if (wordStart == sentenceEnd) return false;
+
+ wordEnd = wordStart+1;
+ while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd]))
+ wordEnd++;
+
+ clearAttributes();
+ termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
+ offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+ posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
+ posBoost = 0;
+ return true;
+ }
+ }
+}
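[Editorial note] A hedged consumption sketch (illustrative only; it assumes same-package access to the SentenceAndWordTokenizer test class above) of how a consumer sees the posinc bump: accumulating position increments exposes the gap at the sentence boundary, which is what defeats exact phrase queries across sentences.

    package org.apache.lucene.analysis.util;

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class PrintPositionsSketch {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new TestSegmentingTokenizerBase.SentenceAndWordTokenizer(
            new StringReader("He said, \"Are you going?\" John shook his head."));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        int position = -1;
        while (ts.incrementToken()) {
          position += posInc.getPositionIncrement();
          // prints: He@0 said@1 Are@2 you@3 going@4 John@6 shook@7 his@8 head@9
          System.out.println(term + "@" + position);
        }
        ts.end();
        ts.close();
      }
    }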
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1227035&r1=1227034&r2=1227035&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Wed Jan 4 03:01:23 2012
@@ -19,77 +19,55 @@ package org.apache.lucene.analysis.kurom
import java.io.IOException;
import java.io.Reader;
+import java.text.BreakIterator;
import java.util.List;
+import java.util.Locale;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
-public final class KuromojiTokenizer extends Tokenizer {
+public final class KuromojiTokenizer extends SegmentingTokenizerBase {
+ private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer;
- private final StringBuilder str = new StringBuilder();
-
- private List<Token> tokens;
-
+ private List<Token> tokens;
private int tokenIndex = 0;
+ private int sentenceStart = 0;
public KuromojiTokenizer(org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer, Reader input) throws IOException {
- super(input);
+ super(input, (BreakIterator) proto.clone());
this.tokenizer = tokenizer;
- // nocommit: this won't really work for large docs.
- // what kind of context does kuromoji need? just sentence maybe?
- fillBuffer(str, input);
- init();
}
- private void init() {
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.sentenceStart = sentenceStart;
+ // TODO: allow the tokenizer (or at least doTokenize) to take a char[], CharSequence, or CharacterIterator?
+ tokens = tokenizer.tokenize(new String(buffer, sentenceStart, sentenceEnd-sentenceStart));
tokenIndex = 0;
- tokens = tokenizer.tokenize(str.toString());
}
-
+
@Override
- public boolean incrementToken() {
- if(tokenIndex == tokens.size()) {
+ protected boolean incrementWord() {
+ if (tokenIndex == tokens.size()) {
return false;
}
-
Token token = tokens.get(tokenIndex);
+ // TODO: we don't really need the surface form except for its length? (it's in the buffer already)
String surfaceForm = token.getSurfaceForm();
int position = token.getPosition();
int length = surfaceForm.length();
- int end = position + length;
clearAttributes();
- termAtt.setEmpty().append(str, position, end);
- offsetAtt.setOffset(correctOffset(position), correctOffset(end));
+ termAtt.copyBuffer(buffer, sentenceStart + position, length);
+ int startOffset = offset + sentenceStart + position;
+ offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
typeAtt.setType(token.getPartOfSpeech());
tokenIndex++;
return true;
}
-
- @Override
- public void end() {
- final int ofs = correctOffset(str.length());
- offsetAtt.setOffset(ofs, ofs);
- }
-
- @Override
- public void reset(Reader input) throws IOException{
- super.reset(input);
- fillBuffer(str, input);
- init();
- }
-
- final char[] buffer = new char[8192];
- private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
- int len;
- sb.setLength(0);
- while ((len = input.read(buffer)) > 0) {
- sb.append(buffer, 0, len);
- }
- }
}
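[Editorial note] For context, a hedged usage sketch (illustrative only; the builder call and the KuromojiAnalyzer constructor are used the same way in TestQuality below): after this change the tokenizer consumes the reader sentence by sentence, so a large document is never buffered whole in RAM.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;
    import org.apache.lucene.analysis.kuromoji.Tokenizer.Mode;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class KuromojiUsageSketch {
      public static void main(String[] args) throws Exception {
        org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer =
            org.apache.lucene.analysis.kuromoji.Tokenizer.builder().mode(Mode.NORMAL).build();
        Analyzer analyzer = new KuromojiAnalyzer(tokenizer);
        // two sentences: each is segmented and tokenized independently
        TokenStream stream = analyzer.tokenStream("f",
            new StringReader("私は学生です。東京に住んでいます。"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
          System.out.println(term.toString());
        }
        stream.close();
      }
    }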
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java?rev=1227035&r1=1227034&r2=1227035&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java Wed Jan 4 03:01:23 2012
@@ -168,7 +168,7 @@ public class Tokenizer {
private Mode mode = Mode.NORMAL;
- private boolean split = true;
+ private boolean split = false;
private UserDictionary userDictionary = null;
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1227035&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Wed Jan 4 03:01:23 2012
@@ -0,0 +1,228 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.zip.ZipFile;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.Tokenizer.Mode;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.LuceneTestCase;
+
+// nocommit: I don't think we need this or its huge files?
+// just compares segmentation to some sentences pre-tokenized by MeCab
+public class TestQuality extends LuceneTestCase {
+
+ public void test() throws Exception {
+ File datafile = getDataFile("tanakaseg.zip");
+ ZipFile zip = new ZipFile(datafile);
+ InputStream is = zip.getInputStream(zip.getEntry("sentences.txt"));
+ BufferedReader unseg = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ InputStream is2 = zip.getInputStream(zip.getEntry("segmented.txt"));
+ BufferedReader seg = new BufferedReader(new InputStreamReader(is2, "UTF-8"));
+ Stats stats = new Stats();
+ /**
+ #words: 1578506
+ #chars: 4519246
+ #edits: 651
+ #sentences: 150122
+ sentence agreement?: 0.998161495317142
+ word agreement?: 0.999587584716181
+ */
+ final org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer =
+ org.apache.lucene.analysis.kuromoji.Tokenizer.builder().mode(Mode.NORMAL).build();
+ Analyzer testAnalyzer = new KuromojiAnalyzer(tokenizer);
+
+ String line1 = null;
+ String line2 = null;
+ while ((line1 = unseg.readLine()) != null) {
+ line2 = seg.readLine();
+ evaluateLine(line1, line2, testAnalyzer, stats);
+ }
+
+ System.out.println("#words: " + stats.numWords);
+ System.out.println("#chars: " + stats.numChars);
+ System.out.println("#edits: " + stats.numEdits);
+ System.out.println("#sentences: " + stats.numSentences);
+ System.out.println("sentence agreement?: " + (stats.numSentencesCorrect/(double)stats.numSentences));
+ System.out.println("word agreement?: " + (1D - (stats.numEdits / (double)stats.numWords)));
+ unseg.close();
+ seg.close();
+ zip.close();
+ }
+
+ static class Stats {
+ long numWords = 0;
+ long numEdits = 0;
+ long numChars = 0;
+ long numSentences = 0;
+ long numSentencesCorrect = 0;
+ }
+
+ public static void evaluateLine(String unseg, String seg, Analyzer analyzer, Stats stats) throws Exception {
+ List<String> tokens = new ArrayList<String>();
+ TokenStream stream = analyzer.tokenStream("bogus", new StringReader(unseg));
+ CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
+ stream.reset();
+ while (stream.incrementToken()) {
+ tokens.add(termAtt.toString());
+ }
+ stream.close();
+
+ List<String> expectedTokens = Arrays.asList(seg.split("\\s+"));
+ tokens = normalize(tokens);
+ expectedTokens = normalize(expectedTokens);
+
+ HashMap<String,Character> transformation = new HashMap<String,Character>();
+ CharRef charRef = new CharRef();
+
+ String s1 = transform(tokens, transformation, charRef);
+ String s2 = transform(expectedTokens, transformation, charRef);
+
+ int edits = getDistance(s2, s1);
+ //if (edits > 0) {
+ // System.out.println("unseg: " + unseg);
+ // System.out.println(tokens + " vs " + expectedTokens);
+ //}
+ stats.numChars += seg.length();
+ stats.numEdits += edits;
+ stats.numWords += expectedTokens.size();
+ stats.numSentences++;
+ if (edits == 0)
+ stats.numSentencesCorrect++;
+ }
+
+ static class CharRef {
+ char c = 'a';
+ }
+
+ static String transform(List<String> tokens, HashMap<String,Character> transformation, CharRef ref) {
+ StringBuilder builder = new StringBuilder();
+ for (String token : tokens) {
+ Character value = transformation.get(token);
+
+ if (value == null) {
+ value = new Character(ref.c);
+ ref.c++;
+ transformation.put(token, value);
+ }
+
+ builder.append(value.charValue());
+ }
+ return builder.toString();
+ }
+
+ static List<String> normalize(List<String> tokens) {
+ List<String> newList = new ArrayList<String>();
+ Iterator<String> iterator = tokens.iterator();
+ while (iterator.hasNext()) {
+ String term = iterator.next();
+ if (Character.isLetterOrDigit(term.charAt(0)))
+ newList.add(term);
+ }
+ return newList;
+ }
+
+
+ //*****************************
+ // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
+ //*****************************
+ private static int getDistance (String target, String other) {
+ char[] sa;
+ int n;
+ int p[]; //'previous' cost array, horizontally
+ int d[]; // cost array, horizontally
+ int _d[]; //placeholder to assist in swapping p and d
+
+ /*
+ The difference between this impl. and the previous is that, rather
+ than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+ we maintain two single-dimensional arrays of length s.length()+1. The first, d,
+ is the 'current working' distance array that maintains the newest distance cost
+ counts as we iterate through the characters of String s. Each time we increment
+ the index of String t we are comparing, d is copied to p, the second int[]. Doing so
+ allows us to retain the previous cost counts as required by the algorithm (taking
+ the minimum of the cost count to the left, up one, and diagonally up and to the left
+ of the current cost count being calculated). (Note that the arrays aren't really
+ copied anymore, just switched...this is clearly much better than cloning an array
+ or doing a System.arraycopy() each time through the outer loop.)
+
+ Effectively, the difference between the two implementations is this one does not
+ cause an out of memory condition when calculating the LD over two very large strings.
+ */
+
+ sa = target.toCharArray();
+ n = sa.length;
+ p = new int[n+1];
+ d = new int[n+1];
+
+ final int m = other.length();
+ if (n == 0 || m == 0) {
+ if (n == m) {
+ return 0;
+ }
+ else {
+ return Math.max(n, m);
+ }
+ }
+
+
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
+
+ char t_j; // jth character of t
+
+ int cost; // cost
+
+ for (i = 0; i<=n; i++) {
+ p[i] = i;
+ }
+
+ for (j = 1; j<=m; j++) {
+ t_j = other.charAt(j-1);
+ d[0] = j;
+
+ for (i=1; i<=n; i++) {
+ cost = sa[i-1]==t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return Math.abs(p[n]);
+ }
+}
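[Editorial note] Worth calling out (illustration only, not part of the commit): transform() maps each distinct token to a fresh char, so word-level edit distance reduces to plain character Levenshtein over short strings, e.g. the token lists [A, B, C] and [A, B, D, E] become "abc" and "abde" (distance 2). The word-agreement figure recorded in the comment block above then follows directly from the printed counts:

    public class AgreementFigures {
      public static void main(String[] args) {
        long numWords = 1578506, numEdits = 651; // from the recorded run above
        // word agreement = 1 - edits/words; prints 0.999587584716181
        System.out.println(1D - (numEdits / (double) numWords));
      }
    }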
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/tanakaseg.zip
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/tanakaseg.zip?rev=1227035&view=auto
==============================================================================
Binary file - no diff available.