You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/02/16 18:47:07 UTC
svn commit: r1245098 - in /lucene/dev/branches/lucene3767:
lucene/test-framework/src/java/org/apache/lucene/analysis/
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/...
Author: mikemccand
Date: Thu Feb 16 17:47:06 2012
New Revision: 1245098
URL: http://svn.apache.org/viewvc?rev=1245098&view=rev
Log:
LUCENE-3767: get toDot working for Viterbi lattice; replace old tokenizer w/ new; leave default at Mode.SEARCH; no more nocommits
Added:
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java (with props)
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
- copied, changed from r1244433, lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
Removed:
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
Modified:
lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
Modified: lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java (original)
+++ lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java Thu Feb 16 17:47:06 2012
@@ -67,7 +67,7 @@ public class TokenStreamToDot {
final boolean isFirst = pos == -1;
int posInc = posIncAtt.getPositionIncrement();
if (isFirst && posInc == 0) {
- // nocommit hmm are TS's still allowed to do this...?
+ // TODO: hmm are TS's still allowed to do this...?
System.err.println("WARNING: first posInc was 0; correcting to 1");
posInc = 1;
}
Added: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java?rev=1245098&view=auto
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java (added)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java Thu Feb 16 17:47:06 2012
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Position;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.WrappedPositionArray;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+
+
+// TODO: would be nice to show 2nd best path in a diff't
+// color...
+
+public class GraphvizFormatter {
+
+ private final static String BOS_LABEL = "BOS";
+
+ private final static String EOS_LABEL = "EOS";
+
+ private final static String FONT_NAME = "Helvetica";
+
+ private final ConnectionCosts costs;
+
+ private final Map<String, String> bestPathMap;
+
+ private final StringBuilder sb = new StringBuilder();
+
+ public GraphvizFormatter(ConnectionCosts costs) {
+ this.costs = costs;
+ this.bestPathMap = new HashMap<String, String>();
+ sb.append(formatHeader());
+ sb.append(" init [style=invis]\n");
+ sb.append(" init -> 0.0 [label=\"BOS\"]\n");
+ }
+
+ public String finish() {
+ sb.append(formatTrailer());
+ return sb.toString();
+ }
+
+ // Backtraces another incremental fragment:
+ void onBacktrace(KuromojiTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, boolean isEnd) {
+ setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
+ sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
+ if (isEnd) {
+ sb.append(" fini [style=invis]\n");
+ sb.append(" ");
+ sb.append(getNodeID(endPosData.pos, fromIDX));
+ sb.append(" -> fini [label=\"EOS\"]");
+ }
+ }
+
+ // Records which arcs make up the best path:
+ private void setBestPathMap(WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
+ bestPathMap.clear();
+
+ int pos = endPosData.pos;
+ int bestIDX = fromIDX;
+ while (pos > startPos) {
+ final Position posData = positions.get(pos);
+
+ final int backPos = posData.backPos[bestIDX];
+ final int backIDX = posData.backIndex[bestIDX];
+
+ final String toNodeID = getNodeID(pos, bestIDX);
+ final String fromNodeID = getNodeID(backPos, backIDX);
+
+ assert !bestPathMap.containsKey(fromNodeID);
+ assert !bestPathMap.containsValue(toNodeID);
+ bestPathMap.put(fromNodeID, toNodeID);
+ pos = backPos;
+ bestIDX = backIDX;
+ }
+ }
+
+ private String formatNodes(KuromojiTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment) {
+
+ StringBuilder sb = new StringBuilder();
+ // Output nodes
+ for (int pos = startPos+1; pos <= endPosData.pos; pos++) {
+ final Position posData = positions.get(pos);
+ for(int idx=0;idx<posData.count;idx++) {
+ sb.append(" ");
+ sb.append(getNodeID(pos, idx));
+ sb.append(" [label=\"");
+ sb.append(pos);
+ sb.append(": ");
+ sb.append(posData.lastRightID[idx]);
+ sb.append("\"]\n");
+ }
+ }
+
+ // Output arcs
+ for (int pos = endPosData.pos; pos > startPos; pos--) {
+ final Position posData = positions.get(pos);
+ for(int idx=0;idx<posData.count;idx++) {
+ final Position backPosData = positions.get(posData.backPos[idx]);
+ final String toNodeID = getNodeID(pos, idx);
+ final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
+
+ sb.append(" ");
+ sb.append(fromNodeID);
+ sb.append(" -> ");
+ sb.append(toNodeID);
+
+ final String attrs;
+ if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
+ // This arc is on best path
+ attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
+ } else {
+ attrs = "";
+ }
+
+ final Dictionary dict = tok.getDict(posData.backType[idx]);
+ final int wordCost = dict.getWordCost(posData.backID[idx]);
+ final int bgCost = costs.get(backPosData.lastRightID[posData.backIndex[idx]],
+ dict.getLeftId(posData.backID[idx]));
+
+ final String surfaceForm = new String(fragment,
+ posData.backPos[idx] - startPos,
+ pos - posData.backPos[idx]);
+
+ sb.append(" [label=\"");
+ sb.append(surfaceForm);
+ sb.append(' ');
+ sb.append(wordCost);
+ if (bgCost >= 0) {
+ sb.append('+');
+ }
+ sb.append(bgCost);
+ sb.append("\"");
+ sb.append(attrs);
+ sb.append("]\n");
+ }
+ }
+ return sb.toString();
+ }
+
+ private String formatHeader() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("digraph viterbi {\n");
+ sb.append(" graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
+ //sb.append(" // A2 paper size\n");
+ //sb.append(" size = \"34.4,16.5\";\n");
+ //sb.append(" // try to fill paper\n");
+ //sb.append(" ratio = fill;\n");
+ sb.append(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+ sb.append(" node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+
+ return sb.toString();
+ }
+
+ private String formatTrailer() {
+ return "}";
+ }
+
+ private String getNodeID(int pos, int idx) {
+ return pos + "." + idx;
+ }
+}
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Thu Feb 16 17:47:06 2012
@@ -27,36 +27,26 @@ import org.apache.lucene.analysis.Tokeni
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
public class KuromojiAnalyzer extends StopwordAnalyzerBase {
- private final Segmenter segmenter;
private final Mode mode;
private final Set<String> stoptags;
private final UserDictionary userDict;
public KuromojiAnalyzer(Version matchVersion) {
- this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+ this(matchVersion, null, KuromojiTokenizer.DEFAULT_MODE, DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
}
- public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
- super(matchVersion, stopwords);
- this.segmenter = segmenter;
- this.stoptags = stoptags;
- userDict = null;
- mode = Segmenter.DEFAULT_MODE;
- }
-
public KuromojiAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
super(matchVersion, stopwords);
this.userDict = userDict;
this.mode = mode;
this.stoptags = stoptags;
- this.segmenter = null;
}
public static CharArraySet getDefaultStopSet(){
@@ -93,8 +83,7 @@ public class KuromojiAnalyzer extends St
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- //Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, userDict, true, mode);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, userDict, true, mode);
TokenStream stream = new KuromojiBaseFormFilter(tokenizer);
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
stream = new CJKWidthFilter(stream);
Copied: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (from r1244433, lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?p2=lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java&p1=lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java&r1=1244433&r2=1245098&rev=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Thu Feb 16 17:47:06 2012
@@ -26,7 +26,6 @@ import java.util.EnumMap;
import java.util.List;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
@@ -35,7 +34,6 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -49,20 +47,27 @@ import org.apache.lucene.util.fst.FST;
// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...
-// nocommit add toDot and look at 1st pass intersection
-
-// nocomit explain how the 2nd best tokenization is
-// "contextual"...
+/* Uses a rolling Viterbi search to find the least cost
+ * segmentation (path) of the incoming characters. For
+ * tokens that appear to be compound (> length 2 for all
+ * Kanji, or > length 7 for non-Kanji), we see if there is a
+ * 2nd best segmentation of that token after applying
+ * penalties to the long tokens. If so, and the Mode is
+ * SEARCH_WITH_COMPOUNDS, we output the alternate
+ * segmentation as well. */
+public final class KuromojiTokenizer extends Tokenizer {
-// nocommit beast test random data...
+ public static enum Mode {
+ NORMAL, SEARCH, SEARCH_WITH_COMPOUNDS, EXTENDED
+ }
-// nocommit what default mode...?
+ public static final Mode DEFAULT_MODE = Mode.SEARCH;
-/* Uses a rolling Viterbi search to find the least cost
- * segmentation (path) of the incoming characters.
- *
- * @lucene.experimental */
-public final class KuromojiTokenizer2 extends Tokenizer {
+ enum Type {
+ KNOWN,
+ UNKNOWN,
+ USER
+ }
private static final boolean VERBOSE = false;
@@ -127,7 +132,7 @@ public final class KuromojiTokenizer2 ex
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
- public KuromojiTokenizer2(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
+ public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
super(input);
dictionary = TokenInfoDictionary.getInstance();
fst = dictionary.getFST();
@@ -175,6 +180,14 @@ public final class KuromojiTokenizer2 ex
dictionaryMap.put(Type.USER, userDictionary);
}
+ private GraphvizFormatter dotOut;
+
+ /** Expert: set this to produce graphviz (dot) output of
+ * the Viterbi lattice */
+ public void setGraphvizFormatter(GraphvizFormatter dotOut) {
+ this.dotOut = dotOut;
+ }
+
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
@@ -240,7 +253,7 @@ public final class KuromojiTokenizer2 ex
}
// Holds all back pointers arriving to this position:
- private final static class Position {
+ final static class Position {
int pos;
@@ -417,7 +430,7 @@ public final class KuromojiTokenizer2 ex
// TODO: make generic'd version of this "circular array"?
// It's a bit tricky because we do things to the Position
// (eg, set .pos = N on reuse)...
- private static final class WrappedPositionArray {
+ static final class WrappedPositionArray {
private Position[] positions = new Position[8];
public WrappedPositionArray() {
@@ -672,6 +685,7 @@ public final class KuromojiTokenizer2 ex
// In the case of normal mode, it doesn't process unknown word greedily.
if (!searchMode && unknownWordEndIndex > posData.pos) {
+ pos++;
continue;
}
@@ -858,6 +872,10 @@ public final class KuromojiTokenizer2 ex
final char[] fragment = buffer.get(lastBackTracePos, endPos-lastBackTracePos);
+ if (dotOut != null) {
+ dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
+ }
+
int pos = endPos;
int bestIDX = fromIDX;
Token altToken = null;
@@ -1093,7 +1111,7 @@ public final class KuromojiTokenizer2 ex
positions.freeBefore(endPos);
}
- private Dictionary getDict(Type type) {
+ Dictionary getDict(Type type) {
return dictionaryMap.get(type);
}
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Thu Feb 16 17:47:06 2012
@@ -17,8 +17,8 @@ package org.apache.lucene.analysis.kurom
* limitations under the License.
*/
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class Token {
private final Dictionary dictionary;
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java Thu Feb 16 17:47:06 2012
@@ -25,19 +25,17 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestExtendedMode extends BaseTokenStreamTestCase {
- private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
private final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- //Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Mode.EXTENDED);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java Thu Feb 16 17:47:06 2012
@@ -23,7 +23,7 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java Thu Feb 16 17:47:06 2012
@@ -28,8 +28,7 @@ public class TestKuromojiBaseFormFilter
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- //Tokenizer tokenizer = new KuromojiTokenizer(reader);
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Segmenter.DEFAULT_MODE);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.DEFAULT_MODE);
return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
}
};
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Thu Feb 16 17:47:06 2012
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
+import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
@@ -28,7 +29,8 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -58,7 +60,15 @@ public class TestKuromojiTokenizer exten
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), false, Mode.SEARCH);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ private Analyzer analyzerNormal = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@@ -66,7 +76,7 @@ public class TestKuromojiTokenizer exten
private Analyzer analyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), true, Mode.SEARCH);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@@ -74,7 +84,7 @@ public class TestKuromojiTokenizer exten
private Analyzer analyzerWithCompounds = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), false, Mode.SEARCH_WITH_COMPOUNDS);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH_WITH_COMPOUNDS);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@@ -82,11 +92,17 @@ public class TestKuromojiTokenizer exten
private Analyzer extendedModeAnalyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), true, Mode.EXTENDED);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
-
+
+ public void testNormalMode() throws Exception {
+ assertAnalyzesTo(analyzerNormal,
+ "シニアソフトウェアエンジニア",
+ new String[] {"シニアソフトウェアエンジニア"});
+ }
+
public void testDecomposition1() throws Exception {
assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
"アメリカ低所得者医療援助制度が、今日では、その予算の約３分の１を老人に費やしている。",
@@ -312,6 +328,29 @@ public class TestKuromojiTokenizer exten
surfaceForms);
}
+ public void testLatticeToDot() throws Exception {
+ final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
+ final Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+ tokenizer.setGraphvizFormatter(gv2);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ String input = "スペースステーションに行きます。うたがわしい。";
+ String[] surfaceForms = {
+ "スペース", "ステーション", "に", "行き", "ます", "。",
+ "うたがわしい", "。"
+ };
+ assertAnalyzesTo(analyzer,
+ input,
+ surfaceForms);
+
+ assertTrue(gv2.finish().indexOf("22.0") != -1);
+ }
+
private void assertReadings(String input, String... readings) throws IOException {
TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Thu Feb 16 17:47:06 2012
@@ -30,7 +30,7 @@ import java.util.List;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -80,14 +80,15 @@ public class TestQuality extends LuceneT
}
//System.out.println("maxLen=" + maxLen);
- final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
- //final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
+ final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
tokenizer.reset();
final String all = sb.toString();
+ System.out.println("all.len=" + all.length());
final int ITERS = 20;
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
for(int iter=0;iter<ITERS;iter++) {
tokenizer.reset(new StringReader(all));
+ tokenizer.reset();
count = 0;
long t0 = System.currentTimeMillis();
while(tokenizer.incrementToken()) {
@@ -122,9 +123,7 @@ public class TestQuality extends LuceneT
word agreement?: 0.999587584716181
*/
- //final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
- //final Tokenizer tokenizer = new KuromojiTokenizer(new Segmenter(Mode.NORMAL), new StringReader(""));
- final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
+ final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
String line1 = null;
String line2 = null;
int count = 0;
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java Thu Feb 16 17:47:06 2012
@@ -28,7 +28,7 @@ import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.util.IOUtils;
public class TestSearchMode extends BaseTokenStreamTestCase {
@@ -37,7 +37,7 @@ public class TestSearchMode extends Base
private final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Mode.SEARCH);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
@@ -45,7 +45,7 @@ public class TestSearchMode extends Base
private final Analyzer analyzerWithCompounds = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Mode.SEARCH_WITH_COMPOUNDS);
+ Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH_WITH_COMPOUNDS);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Thu Feb 16 17:47:06 2012
@@ -23,29 +23,17 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.io.IOException;
-import org.apache.lucene.analysis.kuromoji.SegmenterTest;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class UserDictionaryTest extends LuceneTestCase {
- private UserDictionary readDict() throws IOException {
- InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
- if (is == null)
- throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
- try {
- Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
- return new UserDictionary(reader);
- } finally {
- is.close();
- }
- }
-
@Test
public void testLookup() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
String s = "関西国際空港に行った";
int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be three 関西, 国際, 空港
@@ -69,7 +57,7 @@ public class UserDictionaryTest extends
@Test
public void testReadings() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@@ -83,7 +71,7 @@ public class UserDictionaryTest extends
@Test
public void testPartOfSpeech() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@@ -92,7 +80,7 @@ public class UserDictionaryTest extends
@Test
public void testRead() throws IOException {
- UserDictionary dictionary = readDict();
+ UserDictionary dictionary = TestKuromojiTokenizer.readDict();
assertNotNull(dictionary);
}
}
Modified: lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Thu Feb 16 17:47:06 2012
@@ -28,8 +28,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.BaseTokenizerFactory;
@@ -88,7 +87,7 @@ public class KuromojiTokenizerFactory ex
@Override
public Tokenizer create(Reader input) {
- return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
+ return new KuromojiTokenizer(input, userDictionary, true, mode);
}
private Mode getMode(Map<String, String> args) {
@@ -96,7 +95,7 @@ public class KuromojiTokenizerFactory ex
if (mode != null) {
return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
} else {
- return Segmenter.DEFAULT_MODE;
+ return KuromojiTokenizer.DEFAULT_MODE;
}
}
}