You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/10 19:04:54 UTC
svn commit: r1229660 [1/2] - in /lucene/dev/branches/lucene3305:
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/
modules/analysis/kuromoji/s...
Author: rmuir
Date: Tue Jan 10 18:04:53 2012
New Revision: 1229660
URL: http://svn.apache.org/viewvc?rev=1229660&view=rev
Log:
LUCENE-3305: flesh-out, expose all atts, option omit punctuation, POS stopword removal with tagset, etc
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
- copied, changed from r1229589, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
Removed:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Tue Jan 10 18:04:53 2012
@@ -17,21 +17,76 @@ package org.apache.lucene.analysis.kurom
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
-public class KuromojiAnalyzer extends Analyzer {
+public class KuromojiAnalyzer extends StopwordAnalyzerBase {
private final Segmenter segmenter;
+ private final Set<String> stoptags;
- public KuromojiAnalyzer(Segmenter segmenter) {
+ public KuromojiAnalyzer(Version matchVersion) {
+ this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+ }
+
+ public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) {
+ super(matchVersion, stopwords);
this.segmenter = segmenter;
+ this.stoptags = stoptags;
+ }
+
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ public static Set<String> getDefaultStopTags(){
+ return DefaultSetHolder.DEFAULT_STOP_TAGS;
+ }
+
+ /**
+ * Holder whose static initializer loads DEFAULT_STOP_SET and DEFAULT_STOP_TAGS; the JVM runs it lazily,
+ * the first time the outer class touches one of these static final sets.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+ static final Set<String> DEFAULT_STOP_TAGS;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#");
+ final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
+ DEFAULT_STOP_TAGS = new HashSet<String>();
+ for (Object element : tagset) { // elements are cast to char[] and copied into Strings
+ char chars[] = (char[]) element;
+ DEFAULT_STOP_TAGS.add(new String(chars));
+ }
+ } catch (IOException ex) {
+ // default sets should always be present as they are part of the
+ // distribution (JAR); keep the IOException as the cause so a failure can be diagnosed
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
- return new TokenStreamComponents(tokenizer, tokenizer);
+ TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
+ stream = new CJKWidthFilter(stream);
+ stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
+ stream = new StopFilter(matchVersion, stream, stopwords);
+ stream = new KuromojiBaseFormFilter(stream);
+ return new TokenStreamComponents(tokenizer, stream);
}
}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Token filter that removes every token whose part-of-speech tag is in the given stop-tag set; untagged tokens pass.
+ */
+public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
+ private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+ private final Set<String> stopTags;
+
+ public KuromojiPartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
+ super(enablePositionIncrements, input);
+ this.stopTags = stopTags;
+ }
+
+ @Override
+ protected boolean accept() throws IOException {
+ final String tag = posAtt.getPartOfSpeech();
+ return !(tag != null && stopTags.contains(tag)); // reject only tokens carrying a tag found in the stop set
+ }
+}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Tue Jan 10 18:04:53 2012
@@ -23,7 +23,9 @@ import java.util.List;
import java.util.Locale;
import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
@@ -34,12 +36,18 @@ public final class KuromojiTokenizer ext
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+ private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
+ private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
private final Segmenter segmenter;
private List<Token> tokens;
private int tokenIndex = 0;
private int sentenceStart = 0;
+ public KuromojiTokenizer(Reader input) {
+ this(new Segmenter(), input);
+ }
+
public KuromojiTokenizer(Segmenter segmenter, Reader input) {
super(input, (BreakIterator) proto.clone());
this.segmenter = segmenter;
@@ -49,7 +57,7 @@ public final class KuromojiTokenizer ext
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
this.sentenceStart = sentenceStart;
// TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
- tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart);
+ tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true);
tokenIndex = 0;
}
@@ -67,6 +75,8 @@ public final class KuromojiTokenizer ext
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
basicFormAtt.setToken(token);
posAtt.setToken(token);
+ readingAtt.setToken(token);
+ inflectionAtt.setToken(token);
tokenIndex++;
return true;
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java Tue Jan 10 18:04:53 2012
@@ -145,7 +145,7 @@ public class Segmenter {
private List<Token> doTokenize(int offset, String sentence) {
char text[] = sentence.toCharArray();
- return doTokenize(offset, text, 0, text.length);
+ return doTokenize(offset, text, 0, text.length, false);
}
/**
@@ -154,7 +154,7 @@ public class Segmenter {
* @param sentence sentence to tokenize
* @return list of Token
*/
- public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength) {
+ public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
ArrayList<Token> result = new ArrayList<Token>();
ViterbiNode[][][] lattice;
@@ -168,6 +168,8 @@ public class Segmenter {
int wordId = node.getWordId();
if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
continue;
+ } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
+ continue; // Do not emit punctuation
}
Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
result.add(token);
@@ -189,4 +191,28 @@ public class Segmenter {
return new GraphvizFormatter(ConnectionCosts.getInstance())
.format(lattice[0], lattice[1], bestPath);
}
+
+ /** True for punctuation and, despite the name, also for separators, control/format characters and symbols. */
+ static final boolean isPunctuation(char ch) {
+ final int category = Character.getType(ch);
+ switch (category) {
+ // separators
+ case Character.SPACE_SEPARATOR: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR:
+ // control and format characters
+ case Character.CONTROL: case Character.FORMAT:
+ // punctuation proper
+ case Character.CONNECTOR_PUNCTUATION:
+ case Character.DASH_PUNCTUATION:
+ case Character.START_PUNCTUATION:
+ case Character.END_PUNCTUATION:
+ case Character.INITIAL_QUOTE_PUNCTUATION:
+ case Character.FINAL_QUOTE_PUNCTUATION:
+ case Character.OTHER_PUNCTUATION:
+ // symbols
+ case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: case Character.OTHER_SYMBOL:
+ return true;
+ default:
+ return false;
+ }
+ }
}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,33 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Token attribute exposing Kuromoji inflection data (type and form).
+ * <p>
+ * Either value may be null: inflection does not apply to
+ * every token.
+ */
+public interface InflectionAttribute extends Attribute {
+ String getInflectionType(); // inflection type, or null if not applicable
+ String getInflectionForm(); // inflection form, or null if not applicable
+ void setToken(Token token); // supplies the token whose inflection data is exposed
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+public class InflectionAttributeImpl extends AttributeImpl implements InflectionAttribute, Cloneable {
+ private Token token; // backing token; null until setToken() and again after clear()
+ /** Returns the backing token's inflection type, or null when no token is set. */
+ public String getInflectionType() {
+ return token == null ? null : token.getInflectionType();
+ }
+ /** Returns the backing token's inflection form, or null when no token is set. */
+ public String getInflectionForm() {
+ return token == null ? null : token.getInflectionForm();
+ }
+ /** Sets the token that the getters delegate to. */
+ public void setToken(Token token) {
+ this.token = token;
+ }
+ /** Drops the reference to the backing token. */
+ @Override
+ public void clear() {
+ token = null;
+ }
+ /** Copies the backing token reference to target, which must be an InflectionAttribute. */
+ @Override
+ public void copyTo(AttributeImpl target) {
+ InflectionAttribute t = (InflectionAttribute) target;
+ t.setToken(token);
+ }
+ /** Reflects type and form, each under its own key plus an "(en)" key holding the ToStringUtil translation. */
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ String type = getInflectionType();
+ String typeEN = type == null ? null : ToStringUtil.getInflectionTypeTranslation(type);
+ reflector.reflect(InflectionAttribute.class, "inflectionType", type);
+ reflector.reflect(InflectionAttribute.class, "inflectionType (en)", typeEN);
+ String form = getInflectionForm();
+ String formEN = form == null ? null : ToStringUtil.getInflectedFormTranslation(form);
+ reflector.reflect(InflectionAttribute.class, "inflectionForm", form);
+ reflector.reflect(InflectionAttribute.class, "inflectionForm (en)", formEN);
+ }
+}
\ No newline at end of file
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java Tue Jan 10 18:04:53 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.kurom
*/
import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
@@ -46,6 +47,8 @@ public class PartOfSpeechAttributeImpl e
@Override
public void reflectWith(AttributeReflector reflector) {
String partOfSpeech = getPartOfSpeech();
+ String partOfSpeechEN = partOfSpeech == null ? null : ToStringUtil.getPOSTranslation(partOfSpeech);
reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech", partOfSpeech);
+ reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech (en)", partOfSpeechEN);
}
}
\ No newline at end of file
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,33 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Token attribute exposing Kuromoji reading data (reading and pronunciation).
+ * <p>
+ * Either value may be null: reading data does not apply to
+ * every token.
+ */
+public interface ReadingAttribute extends Attribute {
+ String getReading(); // reading, or null if not applicable
+ String getPronunciation(); // pronunciation, or null if not applicable
+ void setToken(Token token); // supplies the token whose reading data is exposed
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+public class ReadingAttributeImpl extends AttributeImpl implements ReadingAttribute, Cloneable {
+ private Token token; // backing token; null until setToken() and again after clear()
+ /** Returns the backing token's reading, or null when no token is set. */
+ public String getReading() {
+ return token != null ? token.getReading() : null;
+ }
+ /** Returns the backing token's pronunciation, or null when no token is set. */
+ public String getPronunciation() {
+ return token != null ? token.getPronunciation() : null;
+ }
+ /** Sets the token that the getters delegate to. */
+ public void setToken(Token token) {
+ this.token = token;
+ }
+ /** Drops the reference to the backing token. */
+ @Override
+ public void clear() {
+ this.token = null;
+ }
+ /** Hands the backing token reference to target, which must be a ReadingAttribute. */
+ @Override
+ public void copyTo(AttributeImpl target) {
+ // a non-null target of any other type makes this cast throw ClassCastException
+ ((ReadingAttribute) target).setToken(token);
+ }
+ /** Reflects reading and pronunciation, each followed by an "(en)" key holding its ToStringUtil romanization. */
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ final String rd = getReading();
+ final String rdEN = (rd != null) ? ToStringUtil.getRomanization(rd) : null;
+ final String pron = getPronunciation();
+ final String pronEN = (pron != null) ? ToStringUtil.getRomanization(pron) : null;
+ reflector.reflect(ReadingAttribute.class, "reading", rd);
+ reflector.reflect(ReadingAttribute.class, "reading (en)", rdEN);
+ reflector.reflect(ReadingAttribute.class, "pronunciation", pron);
+ reflector.reflect(ReadingAttribute.class, "pronunciation (en)", pronEN);
+ }
+}
\ No newline at end of file
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,1021 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+
+/**
+ * Static helpers that turn Japanese analysis values into ASCII/Latin text:
+ * English names for part-of-speech tags, inflection types and inflected
+ * forms, plus a katakana romanizer.
+ */
+public class ToStringUtil {
+  // a translation map for parts of speech, only used for reflectWith
+  private static final HashMap<String,String> posTranslations = new HashMap<String,String>();
+  static {
+    posTranslations.put("名詞", "noun");
+    posTranslations.put("名詞-一般", "noun-common");
+    posTranslations.put("名詞-固有名詞", "noun-proper");
+    posTranslations.put("名詞-固有名詞-一般", "noun-proper-misc");
+    posTranslations.put("名詞-固有名詞-人名", "noun-proper-person");
+    posTranslations.put("名詞-固有名詞-人名-一般", "noun-proper-person-misc");
+    posTranslations.put("名詞-固有名詞-人名-姓", "noun-proper-person-surname");
+    posTranslations.put("名詞-固有名詞-人名-名", "noun-proper-person-given_name");
+    posTranslations.put("名詞-固有名詞-組織", "noun-proper-organization");
+    posTranslations.put("名詞-固有名詞-地域", "noun-proper-place");
+    posTranslations.put("名詞-固有名詞-地域-一般", "noun-proper-place-misc");
+    posTranslations.put("名詞-固有名詞-地域-国", "noun-proper-place-country");
+    posTranslations.put("名詞-代名詞", "noun-pronoun");
+    posTranslations.put("名詞-代名詞-一般", "noun-pronoun-misc");
+    posTranslations.put("名詞-代名詞-縮約", "noun-pronoun-contraction");
+    posTranslations.put("名詞-副詞可能", "noun-adverbial");
+    posTranslations.put("名詞-サ変接続", "noun-verbal");
+    posTranslations.put("名詞-形容動詞語幹", "noun-adjective-base");
+    posTranslations.put("名詞-数", "noun-numeric");
+    posTranslations.put("名詞-非自立", "noun-affix");
+    posTranslations.put("名詞-非自立-一般", "noun-affix-misc");
+    posTranslations.put("名詞-非自立-副詞可能", "noun-affix-adverbial");
+    posTranslations.put("名詞-非自立-助動詞語幹", "noun-affix-aux");
+    posTranslations.put("名詞-非自立-形容動詞語幹", "noun-affix-adjective-base");
+    posTranslations.put("名詞-特殊", "noun-special");
+    posTranslations.put("名詞-特殊-助動詞語幹", "noun-special-aux");
+    posTranslations.put("名詞-接尾", "noun-suffix");
+    posTranslations.put("名詞-接尾-一般", "noun-suffix-misc");
+    posTranslations.put("名詞-接尾-人名", "noun-suffix-person");
+    posTranslations.put("名詞-接尾-地域", "noun-suffix-place");
+    posTranslations.put("名詞-接尾-サ変接続", "noun-suffix-verbal");
+    posTranslations.put("名詞-接尾-助動詞語幹", "noun-suffix-aux");
+    posTranslations.put("名詞-接尾-形容動詞語幹", "noun-suffix-adjective-base");
+    posTranslations.put("名詞-接尾-副詞可能", "noun-suffix-adverbial");
+    posTranslations.put("名詞-接尾-助数詞", "noun-suffix-classifier");
+    posTranslations.put("名詞-接尾-特殊", "noun-suffix-special");
+    posTranslations.put("名詞-接続詞的", "noun-suffix-conjunctive");
+    posTranslations.put("名詞-動詞非自立的", "noun-verbal_aux");
+    posTranslations.put("名詞-引用文字列", "noun-quotation");
+    posTranslations.put("名詞-ナイ形容詞語幹", "noun-nai_adjective");
+    posTranslations.put("接頭詞", "prefix");
+    posTranslations.put("接頭詞-名詞接続", "prefix-nominal");
+    posTranslations.put("接頭詞-動詞接続", "prefix-verbal");
+    posTranslations.put("接頭詞-形容詞接続", "prefix-adjectival");
+    posTranslations.put("接頭詞-数接続", "prefix-numerical");
+    posTranslations.put("動詞", "verb");
+    posTranslations.put("動詞-自立", "verb-main");
+    posTranslations.put("動詞-非自立", "verb-auxiliary");
+    posTranslations.put("動詞-接尾", "verb-suffix");
+    posTranslations.put("形容詞", "adjective");
+    posTranslations.put("形容詞-自立", "adjective-main");
+    posTranslations.put("形容詞-非自立", "adjective-auxiliary");
+    posTranslations.put("形容詞-接尾", "adjective-suffix");
+    posTranslations.put("副詞", "adverb");
+    posTranslations.put("副詞-一般", "adverb-misc");
+    posTranslations.put("副詞-助詞類接続", "adverb-particle_conjunction");
+    posTranslations.put("連体詞", "adnominal");
+    posTranslations.put("接続詞", "conjunction");
+    posTranslations.put("助詞", "particle");
+    posTranslations.put("助詞-格助詞", "particle-case");
+    posTranslations.put("助詞-格助詞-一般", "particle-case-misc");
+    posTranslations.put("助詞-格助詞-引用", "particle-case-quote");
+    posTranslations.put("助詞-格助詞-連語", "particle-case-compound");
+    posTranslations.put("助詞-接続助詞", "particle-conjunctive");
+    posTranslations.put("助詞-係助詞", "particle-dependency");
+    posTranslations.put("助詞-副助詞", "particle-adverbial");
+    posTranslations.put("助詞-間投助詞", "particle-interjective");
+    posTranslations.put("助詞-並立助詞", "particle-coordinate");
+    posTranslations.put("助詞-終助詞", "particle-final");
+    posTranslations.put("助詞-副助詞／並立助詞／終助詞", "particle-adverbial/conjunctive/final");
+    posTranslations.put("助詞-連体化", "particle-adnominalizer");
+    // 副詞化 is the adverbializer; it previously reused the adnominalizer label
+    posTranslations.put("助詞-副詞化", "particle-adverbializer");
+    posTranslations.put("助詞-特殊", "particle-special");
+    posTranslations.put("助動詞", "auxiliary-verb");
+    posTranslations.put("感動詞", "interjection");
+    posTranslations.put("記号", "symbol");
+    posTranslations.put("記号-一般", "symbol-misc");
+    posTranslations.put("記号-句点", "symbol-period");
+    posTranslations.put("記号-読点", "symbol-comma");
+    posTranslations.put("記号-空白", "symbol-space");
+    posTranslations.put("記号-括弧開", "symbol-open_bracket");
+    posTranslations.put("記号-括弧閉", "symbol-close_bracket");
+    posTranslations.put("記号-アルファベット", "symbol-alphabetic");
+    posTranslations.put("その他", "other");
+    posTranslations.put("その他-間投", "other-interjection");
+    posTranslations.put("フィラー", "filler");
+    posTranslations.put("非言語音", "non-verbal");
+    posTranslations.put("語断片", "fragment");
+    posTranslations.put("未知語", "unknown");
+  }
+
+  /**
+   * Get the english form of a POS tag
+   *
+   * @param s Japanese part-of-speech tag, e.g. {@code 名詞-一般}
+   * @return the English label, or {@code null} if the tag has no entry
+   */
+  public static String getPOSTranslation(String s) {
+    return posTranslations.get(s);
+  }
+
+  // a translation map for inflection types, only used for reflectWith
+  private static final HashMap<String,String> inflTypeTranslations = new HashMap<String,String>();
+  static {
+    inflTypeTranslations.put("*", "*");
+    inflTypeTranslations.put("形容詞・アウオ段", "adj-group-a-o-u");
+    inflTypeTranslations.put("形容詞・イ段", "adj-group-i");
+    inflTypeTranslations.put("不変化型", "non-inflectional");
+    // タ is "ta" and ダ is "da"; the two labels were previously swapped
+    inflTypeTranslations.put("特殊・タ", "special-ta");
+    inflTypeTranslations.put("特殊・ダ", "special-da");
+    inflTypeTranslations.put("文語・ゴトシ", "classical-gotoshi");
+    inflTypeTranslations.put("特殊・ジャ", "special-ja");
+    inflTypeTranslations.put("特殊・ナイ", "special-nai");
+    inflTypeTranslations.put("五段・ラ行特殊", "5-row-cons-r-special");
+    inflTypeTranslations.put("特殊・ヌ", "special-nu");
+    inflTypeTranslations.put("文語・キ", "classical-ki");
+    inflTypeTranslations.put("特殊・タイ", "special-tai");
+    inflTypeTranslations.put("文語・ベシ", "classical-beshi");
+    inflTypeTranslations.put("特殊・ヤ", "special-ya");
+    inflTypeTranslations.put("文語・マジ", "classical-maji");
+    inflTypeTranslations.put("下二・タ行", "2-row-lower-cons-t");
+    inflTypeTranslations.put("特殊・デス", "special-desu");
+    inflTypeTranslations.put("特殊・マス", "special-masu");
+    inflTypeTranslations.put("五段・ラ行アル", "5-row-aru");
+    inflTypeTranslations.put("文語・ナリ", "classical-nari");
+    inflTypeTranslations.put("文語・リ", "classical-ri");
+    inflTypeTranslations.put("文語・ケリ", "classical-keri");
+    inflTypeTranslations.put("文語・ル", "classical-ru");
+    inflTypeTranslations.put("五段・カ行イ音便", "5-row-cons-k-i-onbin");
+    inflTypeTranslations.put("五段・サ行", "5-row-cons-s");
+    inflTypeTranslations.put("一段", "1-row");
+    inflTypeTranslations.put("五段・ワ行促音便", "5-row-cons-w-cons-onbin");
+    inflTypeTranslations.put("五段・マ行", "5-row-cons-m");
+    inflTypeTranslations.put("五段・タ行", "5-row-cons-t");
+    inflTypeTranslations.put("五段・ラ行", "5-row-cons-r");
+    inflTypeTranslations.put("サ変・−スル", "irregular-suffix-suru");
+    inflTypeTranslations.put("五段・ガ行", "5-row-cons-g");
+    inflTypeTranslations.put("サ変・−ズル", "irregular-suffix-zuru");
+    // key previously carried a trailing space, so lookups for this type missed
+    inflTypeTranslations.put("五段・バ行", "5-row-cons-b");
+    inflTypeTranslations.put("五段・ワ行ウ音便", "5-row-cons-w-u-onbin");
+    inflTypeTranslations.put("下二・ダ行", "2-row-lower-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便ユク", "5-row-cons-k-cons-onbin-yuku");
+    inflTypeTranslations.put("上二・ダ行", "2-row-upper-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便", "5-row-cons-k-cons-onbin");
+    inflTypeTranslations.put("一段・得ル", "1-row-eru");
+    inflTypeTranslations.put("四段・タ行", "4-row-cons-t");
+    inflTypeTranslations.put("五段・ナ行", "5-row-cons-n");
+    inflTypeTranslations.put("下二・ハ行", "2-row-lower-cons-h");
+    inflTypeTranslations.put("四段・ハ行", "4-row-cons-h");
+    inflTypeTranslations.put("四段・バ行", "4-row-cons-b");
+    inflTypeTranslations.put("サ変・スル", "irregular-suru");
+    inflTypeTranslations.put("上二・ハ行", "2-row-upper-cons-h");
+    inflTypeTranslations.put("下二・マ行", "2-row-lower-cons-m");
+    inflTypeTranslations.put("四段・サ行", "4-row-cons-s");
+    inflTypeTranslations.put("下二・ガ行", "2-row-lower-cons-g");
+    inflTypeTranslations.put("カ変・来ル", "kuru-kanji");
+    inflTypeTranslations.put("一段・クレル", "1-row-kureru");
+    inflTypeTranslations.put("下二・得", "2-row-lower-u");
+    inflTypeTranslations.put("カ変・クル", "kuru-kana");
+    inflTypeTranslations.put("ラ変", "irregular-cons-r");
+    inflTypeTranslations.put("下二・カ行", "2-row-lower-cons-k");
+  }
+
+  /**
+   * Get the english form of inflection type
+   *
+   * @param s Japanese inflection type, e.g. {@code 一段}
+   * @return the English label, or {@code null} if the type has no entry
+   */
+  public static String getInflectionTypeTranslation(String s) {
+    return inflTypeTranslations.get(s);
+  }
+
+  // a translation map for inflection forms, only used for reflectWith
+  private static final HashMap<String,String> inflFormTranslations = new HashMap<String,String>();
+  static {
+    inflFormTranslations.put("*", "*");
+    inflFormTranslations.put("基本形", "base");
+    inflFormTranslations.put("文語基本形", "classical-base");
+    inflFormTranslations.put("未然ヌ接続", "imperfective-nu-connection");
+    inflFormTranslations.put("未然ウ接続", "imperfective-u-connection");
+    inflFormTranslations.put("連用タ接続", "conjunctive-ta-connection");
+    inflFormTranslations.put("連用テ接続", "conjunctive-te-connection");
+    inflFormTranslations.put("連用ゴザイ接続", "conjunctive-gozai-connection");
+    inflFormTranslations.put("体言接続", "uninflected-connection");
+    inflFormTranslations.put("仮定形", "subjunctive");
+    inflFormTranslations.put("命令ｅ", "imperative-e");
+    inflFormTranslations.put("仮定縮約１", "conditional-contracted-1");
+    inflFormTranslations.put("仮定縮約２", "conditional-contracted-2");
+    inflFormTranslations.put("ガル接続", "garu-connection");
+    inflFormTranslations.put("未然形", "imperfective");
+    inflFormTranslations.put("連用形", "conjunctive");
+    inflFormTranslations.put("音便基本形", "onbin-base");
+    inflFormTranslations.put("連用デ接続", "conjunctive-de-connection");
+    inflFormTranslations.put("未然特殊", "imperfective-special");
+    inflFormTranslations.put("命令ｉ", "imperative-i");
+    inflFormTranslations.put("連用ニ接続", "conjunctive-ni-connection");
+    inflFormTranslations.put("命令ｙｏ", "imperative-yo");
+    inflFormTranslations.put("体言接続特殊", "adnominal-special");
+    inflFormTranslations.put("命令ｒｏ", "imperative-ro");
+    inflFormTranslations.put("体言接続特殊２", "uninflected-special-connection-2");
+    inflFormTranslations.put("未然レル接続", "imperfective-reru-connection");
+    inflFormTranslations.put("現代基本形", "modern-base");
+  }
+
+  /**
+   * Get the english form of inflected form
+   *
+   * @param s Japanese inflected form, e.g. {@code 基本形}
+   * @return the English label, or {@code null} if the form has no entry
+   */
+  public static String getInflectedFormTranslation(String s) {
+    return inflFormTranslations.get(s);
+  }
+
+  /**
+   * Romanize katakana with modified hepburn
+   * <p>
+   * The input is scanned left to right with up to two characters of
+   * lookahead so that digraphs and long vowels are emitted as one unit.
+   * The prolonged sound mark (ー) produces no output, and any character that
+   * is not handled katakana is copied through unchanged.
+   *
+   * @param s katakana text; must not be {@code null}
+   * @return the romanized text
+   */
+  public static String getRomanization(String s) {
+    StringBuilder builder = new StringBuilder();
+    final int len = s.length();
+    for (int i = 0; i < len; i++) {
+      // maximum lookahead: 3
+      char ch = s.charAt(i);
+      char ch2 = (i < len - 1) ? s.charAt(i + 1) : 0;
+      char ch3 = (i < len - 2) ? s.charAt(i + 2) : 0;
+
+      main: switch (ch) {
+        case 'ッ':
+          // sokuon: double the consonant of the following kana; emit nothing otherwise
+          switch (ch2) {
+            case 'カ':
+            case 'キ':
+            case 'ク':
+            case 'ケ':
+            case 'コ':
+              builder.append('k');
+              break main;
+            case 'サ':
+            case 'シ':
+            case 'ス':
+            case 'セ':
+            case 'ソ':
+              builder.append('s');
+              break main;
+            case 'タ':
+            case 'チ':
+            case 'ツ':
+            case 'テ':
+            case 'ト':
+              builder.append('t');
+              break main;
+            case 'パ':
+            case 'ピ':
+            case 'プ':
+            case 'ペ':
+            case 'ポ':
+              builder.append('p');
+              break main;
+          }
+          break;
+        case 'ア':
+          builder.append('a');
+          break;
+        case 'イ':
+          if (ch2 == 'ィ') {
+            builder.append("yi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("ye");
+            i++;
+          } else {
+            builder.append('i');
+          }
+          break;
+        case 'ウ':
+          switch(ch2) {
+            case 'ァ':
+              builder.append("wa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("wi");
+              i++;
+              break;
+            case 'ゥ':
+              builder.append("wu");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("we");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("wo");
+              i++;
+              break;
+            case 'ュ':
+              builder.append("wyu");
+              i++;
+              break;
+            default:
+              builder.append('u');
+              break;
+          }
+          break;
+        case 'エ':
+          builder.append('e');
+          break;
+        case 'オ':
+          if (ch2 == 'ウ') {
+            builder.append('ō');
+            i++;
+          } else {
+            builder.append('o');
+          }
+          break;
+        case 'カ':
+          builder.append("ka");
+          break;
+        case 'キ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("kyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("kyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("kya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("kyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("kyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("kye");
+            i++;
+          } else {
+            builder.append("ki");
+          }
+          break;
+        case 'ク':
+          switch(ch2) {
+            case 'ァ':
+              builder.append("kwa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("kwi");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("kwe");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("kwo");
+              i++;
+              break;
+            case 'ヮ':
+              builder.append("kwa");
+              i++;
+              break;
+            default:
+              builder.append("ku");
+              break;
+          }
+          break;
+        case 'ケ':
+          builder.append("ke");
+          break;
+        case 'コ':
+          if (ch2 == 'ウ') {
+            builder.append("kō");
+            i++;
+          } else {
+            builder.append("ko");
+          }
+          break;
+        case 'サ':
+          builder.append("sa");
+          break;
+        case 'シ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("shō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("shū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("sha");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("sho");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("shu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("she");
+            i++;
+          } else {
+            builder.append("shi");
+          }
+          break;
+        case 'ス':
+          if (ch2 == 'ィ') {
+            builder.append("si");
+            i++;
+          } else {
+            builder.append("su");
+          }
+          break;
+        case 'セ':
+          builder.append("se");
+          break;
+        case 'ソ':
+          if (ch2 == 'ウ') {
+            builder.append("sō");
+            i++;
+          } else {
+            builder.append("so");
+          }
+          break;
+        case 'タ':
+          builder.append("ta");
+          break;
+        case 'チ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("chō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("chū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("cha");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("cho");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("chu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("che");
+            i++;
+          } else {
+            builder.append("chi");
+          }
+          break;
+        case 'ツ':
+          if (ch2 == 'ァ') {
+            builder.append("tsa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("tsi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("tse");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("tso");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tsyu");
+            i++;
+          } else {
+            builder.append("tsu");
+          }
+          break;
+        case 'テ':
+          if (ch2 == 'ィ') {
+            builder.append("ti");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("tu");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tyu");
+            i++;
+          } else {
+            builder.append("te");
+          }
+          break;
+        case 'ト':
+          if (ch2 == 'ウ') {
+            builder.append("tō");
+            i++;
+          } else {
+            builder.append("to");
+          }
+          break;
+        case 'ナ':
+          builder.append("na");
+          break;
+        case 'ニ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("nyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("nyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("nya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("nyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("nyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("nye");
+            i++;
+          } else {
+            builder.append("ni");
+          }
+          break;
+        case 'ヌ':
+          builder.append("nu");
+          break;
+        case 'ネ':
+          builder.append("ne");
+          break;
+        case 'ノ':
+          if (ch2 == 'ウ') {
+            builder.append("nō");
+            i++;
+          } else {
+            builder.append("no");
+          }
+          break;
+        case 'ハ':
+          builder.append("ha");
+          break;
+        case 'ヒ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("hyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("hyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("hya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("hyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("hyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("hye");
+            i++;
+          } else {
+            builder.append("hi");
+          }
+          break;
+        case 'フ':
+          if (ch2 == 'ャ') {
+            builder.append("fya");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("fyu");
+            i++;
+          } else if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("fye");
+            i+=2;
+          } else if (ch2 == 'ョ') {
+            builder.append("fyo");
+            i++;
+          } else if (ch2 == 'ァ') {
+            builder.append("fa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("fi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("fe");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("fo");
+            i++;
+          } else {
+            builder.append("fu");
+          }
+          break;
+        case 'ヘ':
+          builder.append("he");
+          break;
+        case 'ホ':
+          if (ch2 == 'ウ') {
+            builder.append("hō");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("hu");
+            i++;
+          } else {
+            builder.append("ho");
+          }
+          break;
+        case 'マ':
+          builder.append("ma");
+          break;
+        case 'ミ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("myō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("myū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("mya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("myo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("myu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("mye");
+            i++;
+          } else {
+            builder.append("mi");
+          }
+          break;
+        case 'ム':
+          builder.append("mu");
+          break;
+        case 'メ':
+          // was "mi", which duplicated the ミ output
+          builder.append("me");
+          break;
+        case 'モ':
+          if (ch2 == 'ウ') {
+            builder.append("mō");
+            i++;
+          } else {
+            builder.append("mo");
+          }
+          break;
+        case 'ヤ':
+          builder.append("ya");
+          break;
+        case 'ユ':
+          builder.append("yu");
+          break;
+        case 'ヨ':
+          if (ch2 == 'ウ') {
+            builder.append("yō");
+            i++;
+          } else {
+            builder.append("yo");
+          }
+          break;
+        case 'ラ':
+          builder.append("ra");
+          break;
+        case 'リ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("ryō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("ryū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("rya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("ryo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("ryu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("rye");
+            i++;
+          } else {
+            builder.append("ri");
+          }
+          break;
+        case 'ル':
+          builder.append("ru");
+          break;
+        case 'レ':
+          builder.append("re");
+          break;
+        case 'ロ':
+          if (ch2 == 'ウ') {
+            builder.append("rō");
+            i++;
+          } else {
+            builder.append("ro");
+          }
+          break;
+        case 'ワ':
+          builder.append("wa");
+          break;
+        case 'ヰ':
+          builder.append("i");
+          break;
+        case 'ヱ':
+          builder.append("e");
+          break;
+        case 'ヲ':
+          builder.append("o");
+          break;
+        case 'ン':
+          // every branch leaves the outer switch, so nothing falls through to ガ
+          switch (ch2) {
+            case 'バ':
+            case 'ビ':
+            case 'ブ':
+            case 'ベ':
+            case 'ボ':
+            case 'パ':
+            case 'ピ':
+            case 'プ':
+            case 'ペ':
+            case 'ポ':
+            case 'マ':
+            case 'ミ':
+            case 'ム':
+            case 'メ':
+            case 'モ':
+              builder.append('m');
+              break main;
+            case 'ヤ':
+            case 'ユ':
+            case 'ヨ':
+            case 'ア':
+            case 'イ':
+            case 'ウ':
+            case 'エ':
+            case 'オ':
+              builder.append("n'");
+              break main;
+            default:
+              builder.append("n");
+              break main;
+          }
+        case 'ガ':
+          builder.append("ga");
+          break;
+        case 'ギ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("gyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("gyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("gya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("gyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("gyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("gye");
+            i++;
+          } else {
+            builder.append("gi");
+          }
+          break;
+        case 'グ':
+          switch(ch2) {
+            case 'ァ':
+              builder.append("gwa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("gwi");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("gwe");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("gwo");
+              i++;
+              break;
+            case 'ヮ':
+              builder.append("gwa");
+              i++;
+              break;
+            default:
+              builder.append("gu");
+              break;
+          }
+          break;
+        case 'ゲ':
+          builder.append("ge");
+          break;
+        case 'ゴ':
+          if (ch2 == 'ウ') {
+            builder.append("gō");
+            i++;
+          } else {
+            builder.append("go");
+          }
+          break;
+        case 'ザ':
+          builder.append("za");
+          break;
+        case 'ジ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("jō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("jū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("ja");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("jo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("ju");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("je");
+            i++;
+          } else {
+            builder.append("ji");
+          }
+          break;
+        case 'ズ':
+          if (ch2 == 'ィ') {
+            builder.append("zi");
+            i++;
+          } else {
+            builder.append("zu");
+          }
+          break;
+        case 'ゼ':
+          builder.append("ze");
+          break;
+        case 'ゾ':
+          if (ch2 == 'ウ') {
+            builder.append("zō");
+            i++;
+          } else {
+            builder.append("zo");
+          }
+          break;
+        case 'ダ':
+          builder.append("da");
+          break;
+        case 'ヂ':
+          builder.append("ji");
+          break;
+        case 'ヅ':
+          builder.append("zu");
+          break;
+        case 'デ':
+          if (ch2 == 'ィ') {
+            builder.append("di");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("dyu");
+            i++;
+          } else {
+            builder.append("de");
+          }
+          break;
+        case 'ド':
+          if (ch2 == 'ウ') {
+            builder.append("dō");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("du");
+            i++;
+          } else {
+            builder.append("do");
+          }
+          break;
+        case 'バ':
+          builder.append("ba");
+          break;
+        case 'ビ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("byō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("byū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("bya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("byo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("byu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("bye");
+            i++;
+          } else {
+            builder.append("bi");
+          }
+          break;
+        case 'ブ':
+          builder.append("bu");
+          break;
+        case 'ベ':
+          builder.append("be");
+          break;
+        case 'ボ':
+          if (ch2 == 'ウ') {
+            builder.append("bō");
+            i++;
+          } else {
+            builder.append("bo");
+          }
+          break;
+        case 'パ':
+          builder.append("pa");
+          break;
+        case 'ピ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("pyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("pyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("pya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("pyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("pyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("pye");
+            i++;
+          } else {
+            builder.append("pi");
+          }
+          break;
+        case 'プ':
+          builder.append("pu");
+          break;
+        case 'ペ':
+          builder.append("pe");
+          break;
+        case 'ポ':
+          if (ch2 == 'ウ') {
+            builder.append("pō");
+            i++;
+          } else {
+            builder.append("po");
+          }
+          break;
+        case 'ヴ':
+          if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("vye");
+            i+= 2;
+          } else {
+            builder.append('v');
+          }
+          break;
+        case 'ァ':
+          builder.append('a');
+          break;
+        case 'ィ':
+          builder.append('i');
+          break;
+        case 'ゥ':
+          builder.append('u');
+          break;
+        case 'ェ':
+          builder.append('e');
+          break;
+        case 'ォ':
+          builder.append('o');
+          break;
+        case 'ヮ':
+          builder.append("wa");
+          break;
+        case 'ャ':
+          builder.append("ya");
+          break;
+        case 'ュ':
+          builder.append("yu");
+          break;
+        case 'ョ':
+          builder.append("yo");
+          break;
+        case 'ー':
+          // prolonged sound mark: deliberately produces no output
+          break;
+        default:
+          builder.append(ch);
+      }
+    }
+    return builder.toString();
+  }
+}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
Binary files - no diff available.