You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/10 19:04:54 UTC

svn commit: r1229660 [1/2] - in /lucene/dev/branches/lucene3305: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ modules/analysis/kuromoji/s...

Author: rmuir
Date: Tue Jan 10 18:04:53 2012
New Revision: 1229660

URL: http://svn.apache.org/viewvc?rev=1229660&view=rev
Log:
LUCENE-3305: flesh-out, expose all atts, option omit punctuation, POS stopword removal with tagset, etc

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
      - copied, changed from r1229589, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
Removed:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
    lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Tue Jan 10 18:04:53 2012
@@ -17,21 +17,76 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 
-public class KuromojiAnalyzer extends Analyzer {
+public class KuromojiAnalyzer extends StopwordAnalyzerBase {
   private final Segmenter segmenter;
+  private final Set<String> stoptags; // part-of-speech tags handed to KuromojiPartOfSpeechStopFilter
   
-  public KuromojiAnalyzer(Segmenter segmenter) {
+  public KuromojiAnalyzer(Version matchVersion) { // defaults: new Segmenter, default stopwords and stop tags
+    this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+  }
+  /** Creates an analyzer with an explicit segmenter, stopword set and set of part-of-speech stop tags. */
+  public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) {
+    super(matchVersion, stopwords);
     this.segmenter = segmenter;
+    this.stoptags = stoptags;
+  }
+  /** Returns the default stopword set, loaded on first use from stopwords.txt. */
+  public static Set<?> getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+  /** Returns the default part-of-speech stop tags, loaded on first use from stoptags.txt. */
+  public static Set<String> getDefaultStopTags(){
+    return DefaultSetHolder.DEFAULT_STOP_TAGS;
+  }
+  
+  /**
+   * Holder that lazily loads DEFAULT_STOP_SET and DEFAULT_STOP_TAGS from stopwords.txt
+   * and stoptags.txt the first time the outer class reads either constant.
+   */
+  private static class DefaultSetHolder {
+    static final Set<?> DEFAULT_STOP_SET;
+    static final Set<String> DEFAULT_STOP_TAGS;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#");
+        final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
+        DEFAULT_STOP_TAGS = new HashSet<String>();
+        for (Object element : tagset) { // the tag set iterates as char[] entries; copy each into a String
+          char chars[] = (char[]) element;
+          DEFAULT_STOP_TAGS.add(new String(chars));
+        }
+      } catch (IOException ex) {
+        // default sets should always be present as they are part of the
+        // distribution (JAR); chain the cause so a broken JAR can be diagnosed
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
   }
   
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
     Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
-    return new TokenStreamComponents(tokenizer, tokenizer);
+    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
+    stream = new CJKWidthFilter(stream);
+    stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags); // true = enablePositionIncrements
+    stream = new StopFilter(matchVersion, stream, stopwords);
+    stream = new KuromojiBaseFormFilter(stream);
+    return new TokenStreamComponents(tokenizer, stream);
   }
 }

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Drops every token whose part-of-speech tag is contained in a given set of stop tags.
+ */
+public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
+  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+  private final Set<String> stopTags;
+
+  public KuromojiPartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
+    super(enablePositionIncrements, input);
+    this.stopTags = stopTags;
+  }
+
+  @Override
+  protected boolean accept() throws IOException {
+    final String tag = posAtt.getPartOfSpeech();
+    return !(tag != null && stopTags.contains(tag)); // untagged tokens and non-stop tags pass
+  }
+}

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Tue Jan 10 18:04:53 2012
@@ -23,7 +23,9 @@ import java.util.List;
 import java.util.Locale;
 
 import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
 import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
@@ -34,12 +36,18 @@ public final class KuromojiTokenizer ext
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
   private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+  private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
+  private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
   private final Segmenter segmenter;
   
   private List<Token> tokens; 
   private int tokenIndex = 0;
   private int sentenceStart = 0;
   
+  public KuromojiTokenizer(Reader input) {
+    this(new Segmenter(), input); // delegates with a freshly constructed default Segmenter
+  }
+  
   public KuromojiTokenizer(Segmenter segmenter, Reader input) {
     super(input, (BreakIterator) proto.clone());
     this.segmenter = segmenter;
@@ -49,7 +57,7 @@ public final class KuromojiTokenizer ext
   protected void setNextSentence(int sentenceStart, int sentenceEnd) {
     this.sentenceStart = sentenceStart;
     // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
-    tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart);
+    tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true); // true = discardPunctuation
     tokenIndex = 0;
   }
 
@@ -67,6 +75,8 @@ public final class KuromojiTokenizer ext
     offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
     basicFormAtt.setToken(token);
     posAtt.setToken(token);
+    readingAtt.setToken(token);
+    inflectionAtt.setToken(token);
     tokenIndex++;
     return true;
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java Tue Jan 10 18:04:53 2012
@@ -145,7 +145,7 @@ public class Segmenter {
   
   private List<Token> doTokenize(int offset, String sentence) {
     char text[] = sentence.toCharArray();
-    return doTokenize(offset, text, 0, text.length);
+    return doTokenize(offset, text, 0, text.length, false); // false = discardPunctuation off: punctuation tokens are kept
   }
   
   /**
@@ -154,7 +154,7 @@ public class Segmenter {
    * @param sentence sentence to tokenize
    * @return list of Token
    */
-  public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength) {
+  public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
     ArrayList<Token> result = new ArrayList<Token>();
     
     ViterbiNode[][][] lattice;
@@ -168,6 +168,8 @@ public class Segmenter {
       int wordId = node.getWordId();
       if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS 
         continue;
+      } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
+        continue; // Do not emit punctuation
       }
       Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType()));	// Pass different dictionary based on the type of node
       result.add(token);
@@ -189,4 +191,28 @@ public class Segmenter {
     return new GraphvizFormatter(ConnectionCosts.getInstance())
       .format(lattice[0], lattice[1], bestPath);
   }
+  
+  static final boolean isPunctuation(char ch) {
+    // One bit per Unicode general category that counts as punctuation here:
+    // separators, control and format characters, and every punctuation and
+    // symbol category. The category constants are all below 32, so they fit an int.
+    final int punctuationCategories =
+        (1 << Character.SPACE_SEPARATOR)
+      | (1 << Character.LINE_SEPARATOR)
+      | (1 << Character.PARAGRAPH_SEPARATOR)
+      | (1 << Character.CONTROL)
+      | (1 << Character.FORMAT)
+      | (1 << Character.DASH_PUNCTUATION)
+      | (1 << Character.START_PUNCTUATION)
+      | (1 << Character.END_PUNCTUATION)
+      | (1 << Character.CONNECTOR_PUNCTUATION)
+      | (1 << Character.OTHER_PUNCTUATION)
+      | (1 << Character.MATH_SYMBOL)
+      | (1 << Character.CURRENCY_SYMBOL)
+      | (1 << Character.MODIFIER_SYMBOL)
+      | (1 << Character.OTHER_SYMBOL)
+      | (1 << Character.INITIAL_QUOTE_PUNCTUATION)
+      | (1 << Character.FINAL_QUOTE_PUNCTUATION);
+    return ((punctuationCategories >>> Character.getType(ch)) & 1) != 0;
+  }
 }

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,33 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Exposes the inflection type and inflection form of the current Kuromoji token.
+ * Either value is null when it does not apply to the token.
+ */
+public interface InflectionAttribute extends Attribute {
+  /** Inflection type of the token backing this attribute, or null. */
+  String getInflectionType();
+  /** Inflection form of the token backing this attribute, or null. */
+  String getInflectionForm();
+  void setToken(Token token);
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+public class InflectionAttributeImpl extends AttributeImpl implements InflectionAttribute, Cloneable {
+  private Token token; // token the values are read from; null until setToken() and after clear()
+  
+  public String getInflectionType() {
+    return token == null ? null : token.getInflectionType();
+  }
+  
+  public String getInflectionForm() {
+    return token == null ? null : token.getInflectionForm();
+  }
+  
+  public void setToken(Token token) {
+    this.token = token;
+  }
+
+  @Override
+  public void clear() {
+    token = null;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    InflectionAttribute t = (InflectionAttribute) target;
+    t.setToken(token); // the target shares this instance's token reference
+  }
+  
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    String type = getInflectionType();
+    String typeEN = type == null ? null : ToStringUtil.getInflectionTypeTranslation(type);
+    reflector.reflect(InflectionAttribute.class, "inflectionType", type);
+    reflector.reflect(InflectionAttribute.class, "inflectionType (en)", typeEN);
+    String form = getInflectionForm();
+    String formEN = form == null ? null : ToStringUtil.getInflectedFormTranslation(form);
+    reflector.reflect(InflectionAttribute.class, "inflectionForm", form);
+    reflector.reflect(InflectionAttribute.class, "inflectionForm (en)", formEN); // own key, parallel to "inflectionType (en)"
+  }
+}
\ No newline at end of file

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java Tue Jan 10 18:04:53 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.kurom
  */
 
 import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeReflector;
 
@@ -46,6 +47,8 @@ public class PartOfSpeechAttributeImpl e
   @Override
   public void reflectWith(AttributeReflector reflector) {
     String partOfSpeech = getPartOfSpeech();
+    String partOfSpeechEN = partOfSpeech == null ? null : ToStringUtil.getPOSTranslation(partOfSpeech); // translation lookup is skipped when there is no tag
     reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech", partOfSpeech);
+    reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech (en)", partOfSpeechEN);
   }
 }
\ No newline at end of file

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,33 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Exposes the reading and the pronunciation of the current Kuromoji token.
+ * Either value is null when it does not apply to the token.
+ */
+public interface ReadingAttribute extends Attribute {
+  /** Reading of the token backing this attribute, or null. */
+  String getReading();
+  /** Pronunciation of the token backing this attribute, or null. */
+  String getPronunciation();
+  void setToken(Token token);
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.kuromoji.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+public class ReadingAttributeImpl extends AttributeImpl implements ReadingAttribute, Cloneable {
+  private Token token; // token the values are read from; null when none is set
+  
+  public String getReading() {
+    return token != null ? token.getReading() : null;
+  }
+  
+  public String getPronunciation() {
+    return token != null ? token.getPronunciation() : null;
+  }
+  
+  public void setToken(Token token) {
+    this.token = token;
+  }
+
+  @Override
+  public void clear() {
+    token = null;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    // the target shares this instance's token reference
+    ((ReadingAttribute) target).setToken(token);
+  }
+  
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    final String readingValue = getReading();
+    final String readingRomanized = readingValue != null ? ToStringUtil.getRomanization(readingValue) : null;
+    final String pronunciationValue = getPronunciation();
+    final String pronunciationRomanized = pronunciationValue != null ? ToStringUtil.getRomanization(pronunciationValue) : null;
+    reflector.reflect(ReadingAttribute.class, "reading", readingValue);
+    reflector.reflect(ReadingAttribute.class, "reading (en)", readingRomanized);
+    reflector.reflect(ReadingAttribute.class, "pronunciation", pronunciationValue);
+    reflector.reflect(ReadingAttribute.class, "pronunciation (en)", pronunciationRomanized);
+  }
+}
\ No newline at end of file

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java?rev=1229660&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java Tue Jan 10 18:04:53 2012
@@ -0,0 +1,1021 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+
+/**
+ * Static helpers that translate Japanese part-of-speech tags, inflection types
+ * and inflection forms into English, and that romanize katakana.
+ * The translation maps are only used for reflectWith.
+ */
+public class ToStringUtil {
+  // a translation map for parts of speech, only used for reflectWith
+  private static final HashMap<String,String> posTranslations = new HashMap<String,String>();
+  static {
+    posTranslations.put("名詞", "noun");
+    posTranslations.put("名詞-一般", "noun-common");
+    posTranslations.put("名詞-固有名詞", "noun-proper");
+    posTranslations.put("名詞-固有名詞-一般", "noun-proper-misc");
+    posTranslations.put("名詞-固有名詞-人名", "noun-proper-person");
+    posTranslations.put("名詞-固有名詞-人名-一般", "noun-proper-person-misc");
+    posTranslations.put("名詞-固有名詞-人名-姓", "noun-proper-person-surname");
+    posTranslations.put("名詞-固有名詞-人名-名", "noun-proper-person-given_name");
+    posTranslations.put("名詞-固有名詞-組織", "noun-proper-organization");
+    posTranslations.put("名詞-固有名詞-地域", "noun-proper-place");
+    posTranslations.put("名詞-固有名詞-地域-一般", "noun-proper-place-misc");
+    posTranslations.put("名詞-固有名詞-地域-国", "noun-proper-place-country");
+    posTranslations.put("名詞-代名詞", "noun-pronoun");
+    posTranslations.put("名詞-代名詞-一般", "noun-pronoun-misc");
+    posTranslations.put("名詞-代名詞-縮約", "noun-pronoun-contraction");
+    posTranslations.put("名詞-副詞可能", "noun-adverbial");
+    posTranslations.put("名詞-サ変接続", "noun-verbal");
+    posTranslations.put("名詞-形容動詞語幹", "noun-adjective-base");
+    posTranslations.put("名詞-数", "noun-numeric");
+    posTranslations.put("名詞-非自立", "noun-affix");
+    posTranslations.put("名詞-非自立-一般", "noun-affix-misc");
+    posTranslations.put("名詞-非自立-副詞可能", "noun-affix-adverbial");
+    posTranslations.put("名詞-非自立-助動詞語幹", "noun-affix-aux");
+    posTranslations.put("名詞-非自立-形容動詞語幹", "noun-affix-adjective-base");
+    posTranslations.put("名詞-特殊", "noun-special");
+    posTranslations.put("名詞-特殊-助動詞語幹", "noun-special-aux");
+    posTranslations.put("名詞-接尾", "noun-suffix");
+    posTranslations.put("名詞-接尾-一般", "noun-suffix-misc");
+    posTranslations.put("名詞-接尾-人名", "noun-suffix-person");
+    posTranslations.put("名詞-接尾-地域", "noun-suffix-place");
+    posTranslations.put("名詞-接尾-サ変接続", "noun-suffix-verbal");
+    posTranslations.put("名詞-接尾-助動詞語幹", "noun-suffix-aux");
+    posTranslations.put("名詞-接尾-形容動詞語幹", "noun-suffix-adjective-base");
+    posTranslations.put("名詞-接尾-副詞可能", "noun-suffix-adverbial");
+    posTranslations.put("名詞-接尾-助数詞", "noun-suffix-classifier");
+    posTranslations.put("名詞-接尾-特殊", "noun-suffix-special");
+    posTranslations.put("名詞-接続詞的", "noun-suffix-conjunctive");
+    posTranslations.put("名詞-動詞非自立的", "noun-verbal_aux");
+    posTranslations.put("名詞-引用文字列", "noun-quotation");
+    posTranslations.put("名詞-ナイ形容詞語幹", "noun-nai_adjective");
+    posTranslations.put("接頭詞", "prefix");
+    posTranslations.put("接頭詞-名詞接続", "prefix-nominal");
+    posTranslations.put("接頭詞-動詞接続", "prefix-verbal");
+    posTranslations.put("接頭詞-形容詞接続", "prefix-adjectival");
+    posTranslations.put("接頭詞-数接続", "prefix-numerical");
+    posTranslations.put("動詞", "verb");
+    posTranslations.put("動詞-自立", "verb-main");
+    posTranslations.put("動詞-非自立", "verb-auxiliary");
+    posTranslations.put("動詞-接尾", "verb-suffix");
+    posTranslations.put("形容詞", "adjective");
+    posTranslations.put("形容詞-自立", "adjective-main");
+    posTranslations.put("形容詞-非自立", "adjective-auxiliary");
+    posTranslations.put("形容詞-接尾", "adjective-suffix");
+    posTranslations.put("副詞", "adverb");
+    posTranslations.put("副詞-一般", "adverb-misc");
+    posTranslations.put("副詞-助詞類接続", "adverb-particle_conjunction");
+    posTranslations.put("連体詞", "adnominal");
+    posTranslations.put("接続詞", "conjunction");
+    posTranslations.put("助詞", "particle");
+    posTranslations.put("助詞-格助詞", "particle-case");
+    posTranslations.put("助詞-格助詞-一般", "particle-case-misc");
+    posTranslations.put("助詞-格助詞-引用", "particle-case-quote");
+    posTranslations.put("助詞-格助詞-連語", "particle-case-compound");
+    posTranslations.put("助詞-接続助詞", "particle-conjunctive");
+    posTranslations.put("助詞-係助詞", "particle-dependency");
+    posTranslations.put("助詞-副助詞", "particle-adverbial");
+    posTranslations.put("助詞-間投助詞", "particle-interjective");
+    posTranslations.put("助詞-並立助詞", "particle-coordinate");
+    posTranslations.put("助詞-終助詞", "particle-final");
+    posTranslations.put("助詞-副助詞/並立助詞/終助詞", "particle-adverbial/conjunctive/final");
+    posTranslations.put("助詞-連体化", "particle-adnominalizer");
+    // 副詞化 is "adverbialization"; it must not share the translation of 連体化 above
+    posTranslations.put("助詞-副詞化", "particle-adverbializer");
+    posTranslations.put("助詞-特殊", "particle-special");
+    posTranslations.put("助動詞", "auxiliary-verb");
+    posTranslations.put("感動詞", "interjection");
+    posTranslations.put("記号", "symbol");
+    posTranslations.put("記号-一般", "symbol-misc");
+    posTranslations.put("記号-句点", "symbol-period");
+    posTranslations.put("記号-読点", "symbol-comma");
+    posTranslations.put("記号-空白", "symbol-space");
+    posTranslations.put("記号-括弧開", "symbol-open_bracket");
+    posTranslations.put("記号-括弧閉", "symbol-close_bracket");
+    posTranslations.put("記号-アルファベット", "symbol-alphabetic");
+    posTranslations.put("その他", "other");
+    posTranslations.put("その他-間投", "other-interjection");
+    posTranslations.put("フィラー", "filler");
+    posTranslations.put("非言語音", "non-verbal");
+    posTranslations.put("語断片", "fragment");
+    posTranslations.put("未知語", "unknown");
+  }
+
+  /**
+   * Get the english form of a POS tag
+   *
+   * @param s the Japanese part-of-speech tag
+   * @return the english translation, or null if the tag is not in the map
+   */
+  public static String getPOSTranslation(String s) {
+    return posTranslations.get(s);
+  }
+
+  // a translation map for inflection types, only used for reflectWith
+  private static final HashMap<String,String> inflTypeTranslations = new HashMap<String,String>();
+  static {
+    inflTypeTranslations.put("*", "*");
+    inflTypeTranslations.put("形容詞・アウオ段", "adj-group-a-o-u");
+    inflTypeTranslations.put("形容詞・イ段", "adj-group-i");
+    inflTypeTranslations.put("不変化型", "non-inflectional");
+    // タ is "ta" and ダ is "da"
+    inflTypeTranslations.put("特殊・タ", "special-ta");
+    inflTypeTranslations.put("特殊・ダ", "special-da");
+    inflTypeTranslations.put("文語・ゴトシ", "classical-gotoshi");
+    inflTypeTranslations.put("特殊・ジャ", "special-ja");
+    inflTypeTranslations.put("特殊・ナイ", "special-nai");
+    inflTypeTranslations.put("五段・ラ行特殊", "5-row-cons-r-special");
+    inflTypeTranslations.put("特殊・ヌ", "special-nu");
+    inflTypeTranslations.put("文語・キ", "classical-ki");
+    inflTypeTranslations.put("特殊・タイ", "special-tai");
+    inflTypeTranslations.put("文語・ベシ", "classical-beshi");
+    inflTypeTranslations.put("特殊・ヤ", "special-ya");
+    inflTypeTranslations.put("文語・マジ", "classical-maji");
+    inflTypeTranslations.put("下二・タ行", "2-row-lower-cons-t");
+    inflTypeTranslations.put("特殊・デス", "special-desu");
+    inflTypeTranslations.put("特殊・マス", "special-masu");
+    inflTypeTranslations.put("五段・ラ行アル", "5-row-aru");
+    inflTypeTranslations.put("文語・ナリ", "classical-nari");
+    inflTypeTranslations.put("文語・リ", "classical-ri");
+    inflTypeTranslations.put("文語・ケリ", "classical-keri");
+    inflTypeTranslations.put("文語・ル", "classical-ru");
+    inflTypeTranslations.put("五段・カ行イ音便", "5-row-cons-k-i-onbin");
+    inflTypeTranslations.put("五段・サ行", "5-row-cons-s");
+    inflTypeTranslations.put("一段", "1-row");
+    inflTypeTranslations.put("五段・ワ行促音便", "5-row-cons-w-cons-onbin");
+    inflTypeTranslations.put("五段・マ行", "5-row-cons-m");
+    inflTypeTranslations.put("五段・タ行", "5-row-cons-t");
+    inflTypeTranslations.put("五段・ラ行", "5-row-cons-r");
+    inflTypeTranslations.put("サ変・−スル", "irregular-suffix-suru");
+    inflTypeTranslations.put("五段・ガ行", "5-row-cons-g");
+    inflTypeTranslations.put("サ変・−ズル", "irregular-suffix-zuru");
+    // the key must not carry trailing whitespace, otherwise the lookup never matches
+    inflTypeTranslations.put("五段・バ行", "5-row-cons-b");
+    inflTypeTranslations.put("五段・ワ行ウ音便", "5-row-cons-w-u-onbin");
+    inflTypeTranslations.put("下二・ダ行", "2-row-lower-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便ユク", "5-row-cons-k-cons-onbin-yuku");
+    inflTypeTranslations.put("上二・ダ行", "2-row-upper-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便", "5-row-cons-k-cons-onbin");
+    inflTypeTranslations.put("一段・得ル", "1-row-eru");
+    inflTypeTranslations.put("四段・タ行", "4-row-cons-t");
+    inflTypeTranslations.put("五段・ナ行", "5-row-cons-n");
+    inflTypeTranslations.put("下二・ハ行", "2-row-lower-cons-h");
+    inflTypeTranslations.put("四段・ハ行", "4-row-cons-h");
+    inflTypeTranslations.put("四段・バ行", "4-row-cons-b");
+    inflTypeTranslations.put("サ変・スル", "irregular-suru");
+    inflTypeTranslations.put("上二・ハ行", "2-row-upper-cons-h");
+    inflTypeTranslations.put("下二・マ行", "2-row-lower-cons-m");
+    inflTypeTranslations.put("四段・サ行", "4-row-cons-s");
+    inflTypeTranslations.put("下二・ガ行", "2-row-lower-cons-g");
+    inflTypeTranslations.put("カ変・来ル", "kuru-kanji");
+    inflTypeTranslations.put("一段・クレル", "1-row-kureru");
+    inflTypeTranslations.put("下二・得", "2-row-lower-u");
+    inflTypeTranslations.put("カ変・クル", "kuru-kana");
+    inflTypeTranslations.put("ラ変", "irregular-cons-r");
+    inflTypeTranslations.put("下二・カ行", "2-row-lower-cons-k");
+  }
+
+  /**
+   * Get the english form of inflection type
+   *
+   * @param s the Japanese inflection type
+   * @return the english translation, or null if the type is not in the map
+   */
+  public static String getInflectionTypeTranslation(String s) {
+    return inflTypeTranslations.get(s);
+  }
+
+  // a translation map for inflection forms, only used for reflectWith
+  private static final HashMap<String,String> inflFormTranslations = new HashMap<String,String>();
+  static {
+    inflFormTranslations.put("*", "*");
+    inflFormTranslations.put("基本形", "base");
+    inflFormTranslations.put("文語基本形", "classical-base");
+    inflFormTranslations.put("未然ヌ接続", "imperfective-nu-connection");
+    inflFormTranslations.put("未然ウ接続", "imperfective-u-connection");
+    inflFormTranslations.put("連用タ接続", "conjunctive-ta-connection");
+    inflFormTranslations.put("連用テ接続", "conjunctive-te-connection");
+    inflFormTranslations.put("連用ゴザイ接続", "conjunctive-gozai-connection");
+    inflFormTranslations.put("体言接続", "uninflected-connection");
+    inflFormTranslations.put("仮定形", "subjunctive");
+    inflFormTranslations.put("命令e", "imperative-e");
+    inflFormTranslations.put("仮定縮約1", "conditional-contracted-1");
+    inflFormTranslations.put("仮定縮約2", "conditional-contracted-2");
+    inflFormTranslations.put("ガル接続", "garu-connection");
+    inflFormTranslations.put("未然形", "imperfective");
+    inflFormTranslations.put("連用形", "conjunctive");
+    inflFormTranslations.put("音便基本形", "onbin-base");
+    inflFormTranslations.put("連用デ接続", "conjunctive-de-connection");
+    inflFormTranslations.put("未然特殊", "imperfective-special");
+    inflFormTranslations.put("命令i", "imperative-i");
+    inflFormTranslations.put("連用ニ接続", "conjunctive-ni-connection");
+    inflFormTranslations.put("命令yo", "imperative-yo");
+    inflFormTranslations.put("体言接続特殊", "adnominal-special");
+    inflFormTranslations.put("命令ro", "imperative-ro");
+    inflFormTranslations.put("体言接続特殊2", "uninflected-special-connection-2");
+    inflFormTranslations.put("未然レル接続", "imperfective-reru-connection");
+    inflFormTranslations.put("現代基本形", "modern-base");
+  }
+
+  /**
+   * Get the english form of inflected form
+   *
+   * @param s the Japanese inflection form
+   * @return the english translation, or null if the form is not in the map
+   */
+  public static String getInflectedFormTranslation(String s) {
+    return inflFormTranslations.get(s);
+  }
+
+  /**
+   * Romanize katakana with modified hepburn
+   * <p>
+   * The input is scanned left to right. Combinations of a kana with a following
+   * small kana, and an o- or u-sound lengthened by ウ, are consumed together.
+   * The prolonged sound mark (ー) produces no output and any character without a
+   * mapping is copied through unchanged.
+   *
+   * @param s the katakana text to romanize; must not be null
+   * @return the romanized text
+   */
+  public static String getRomanization(String s) {
+    StringBuilder builder = new StringBuilder();
+    final int len = s.length();
+    for (int i = 0; i < len; i++) {
+      // window of up to three characters: the current one plus two of lookahead;
+      // 0 stands in for positions past the end of the input
+      char ch = s.charAt(i);
+      char ch2 = (i < len - 1) ? s.charAt(i + 1) : 0;
+      char ch3 = (i < len - 2) ? s.charAt(i + 2) : 0;
+
+      switch (ch) {
+        case 'ッ':
+          appendSokuon(builder, ch2);
+          break;
+        case 'ア':
+          builder.append('a');
+          break;
+        case 'イ':
+          if (ch2 == 'ィ') {
+            builder.append("yi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("ye");
+            i++;
+          } else {
+            builder.append('i');
+          }
+          break;
+        case 'ウ':
+          switch (ch2) {
+            case 'ァ':
+              builder.append("wa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("wi");
+              i++;
+              break;
+            case 'ゥ':
+              builder.append("wu");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("we");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("wo");
+              i++;
+              break;
+            case 'ュ':
+              builder.append("wyu");
+              i++;
+              break;
+            default:
+              builder.append('u');
+              break;
+          }
+          break;
+        case 'エ':
+          builder.append('e');
+          break;
+        case 'オ':
+          i += appendLongO(builder, "", ch2);
+          break;
+        case 'カ':
+          builder.append("ka");
+          break;
+        case 'キ':
+          i += appendIRow(builder, "ky", "ki", ch2, ch3);
+          break;
+        case 'ク':
+          i += appendLabialized(builder, "kw", "ku", ch2);
+          break;
+        case 'ケ':
+          builder.append("ke");
+          break;
+        case 'コ':
+          i += appendLongO(builder, "k", ch2);
+          break;
+        case 'サ':
+          builder.append("sa");
+          break;
+        case 'シ':
+          i += appendIRow(builder, "sh", "shi", ch2, ch3);
+          break;
+        case 'ス':
+          if (ch2 == 'ィ') {
+            builder.append("si");
+            i++;
+          } else {
+            builder.append("su");
+          }
+          break;
+        case 'セ':
+          builder.append("se");
+          break;
+        case 'ソ':
+          i += appendLongO(builder, "s", ch2);
+          break;
+        case 'タ':
+          builder.append("ta");
+          break;
+        case 'チ':
+          i += appendIRow(builder, "ch", "chi", ch2, ch3);
+          break;
+        case 'ツ':
+          if (ch2 == 'ァ') {
+            builder.append("tsa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("tsi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("tse");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("tso");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tsyu");
+            i++;
+          } else {
+            builder.append("tsu");
+          }
+          break;
+        case 'テ':
+          if (ch2 == 'ィ') {
+            builder.append("ti");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("tu");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tyu");
+            i++;
+          } else {
+            builder.append("te");
+          }
+          break;
+        case 'ト':
+          i += appendLongO(builder, "t", ch2);
+          break;
+        case 'ナ':
+          builder.append("na");
+          break;
+        case 'ニ':
+          i += appendIRow(builder, "ny", "ni", ch2, ch3);
+          break;
+        case 'ヌ':
+          builder.append("nu");
+          break;
+        case 'ネ':
+          builder.append("ne");
+          break;
+        case 'ノ':
+          i += appendLongO(builder, "n", ch2);
+          break;
+        case 'ハ':
+          builder.append("ha");
+          break;
+        case 'ヒ':
+          i += appendIRow(builder, "hy", "hi", ch2, ch3);
+          break;
+        case 'フ':
+          if (ch2 == 'ャ') {
+            builder.append("fya");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("fyu");
+            i++;
+          } else if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("fye");
+            i += 2;
+          } else if (ch2 == 'ョ') {
+            builder.append("fyo");
+            i++;
+          } else if (ch2 == 'ァ') {
+            builder.append("fa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("fi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("fe");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("fo");
+            i++;
+          } else {
+            builder.append("fu");
+          }
+          break;
+        case 'ヘ':
+          builder.append("he");
+          break;
+        case 'ホ':
+          if (ch2 == 'ゥ') {
+            builder.append("hu");
+            i++;
+          } else {
+            i += appendLongO(builder, "h", ch2);
+          }
+          break;
+        case 'マ':
+          builder.append("ma");
+          break;
+        case 'ミ':
+          i += appendIRow(builder, "my", "mi", ch2, ch3);
+          break;
+        case 'ム':
+          builder.append("mu");
+          break;
+        case 'メ':
+          // メ belongs to the e-column: "me" ("mi" is ミ)
+          builder.append("me");
+          break;
+        case 'モ':
+          i += appendLongO(builder, "m", ch2);
+          break;
+        case 'ヤ':
+          builder.append("ya");
+          break;
+        case 'ユ':
+          builder.append("yu");
+          break;
+        case 'ヨ':
+          i += appendLongO(builder, "y", ch2);
+          break;
+        case 'ラ':
+          builder.append("ra");
+          break;
+        case 'リ':
+          i += appendIRow(builder, "ry", "ri", ch2, ch3);
+          break;
+        case 'ル':
+          builder.append("ru");
+          break;
+        case 'レ':
+          builder.append("re");
+          break;
+        case 'ロ':
+          i += appendLongO(builder, "r", ch2);
+          break;
+        case 'ワ':
+          builder.append("wa");
+          break;
+        case 'ヰ':
+          builder.append("i");
+          break;
+        case 'ヱ':
+          builder.append("e");
+          break;
+        case 'ヲ':
+          builder.append("o");
+          break;
+        case 'ン':
+          appendMoraicN(builder, ch2);
+          break;
+        case 'ガ':
+          builder.append("ga");
+          break;
+        case 'ギ':
+          i += appendIRow(builder, "gy", "gi", ch2, ch3);
+          break;
+        case 'グ':
+          i += appendLabialized(builder, "gw", "gu", ch2);
+          break;
+        case 'ゲ':
+          builder.append("ge");
+          break;
+        case 'ゴ':
+          i += appendLongO(builder, "g", ch2);
+          break;
+        case 'ザ':
+          builder.append("za");
+          break;
+        case 'ジ':
+          i += appendIRow(builder, "j", "ji", ch2, ch3);
+          break;
+        case 'ズ':
+          if (ch2 == 'ィ') {
+            builder.append("zi");
+            i++;
+          } else {
+            builder.append("zu");
+          }
+          break;
+        case 'ゼ':
+          builder.append("ze");
+          break;
+        case 'ゾ':
+          i += appendLongO(builder, "z", ch2);
+          break;
+        case 'ダ':
+          builder.append("da");
+          break;
+        case 'ヂ':
+          builder.append("ji");
+          break;
+        case 'ヅ':
+          builder.append("zu");
+          break;
+        case 'デ':
+          if (ch2 == 'ィ') {
+            builder.append("di");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("dyu");
+            i++;
+          } else {
+            builder.append("de");
+          }
+          break;
+        case 'ド':
+          if (ch2 == 'ゥ') {
+            builder.append("du");
+            i++;
+          } else {
+            i += appendLongO(builder, "d", ch2);
+          }
+          break;
+        case 'バ':
+          builder.append("ba");
+          break;
+        case 'ビ':
+          i += appendIRow(builder, "by", "bi", ch2, ch3);
+          break;
+        case 'ブ':
+          builder.append("bu");
+          break;
+        case 'ベ':
+          builder.append("be");
+          break;
+        case 'ボ':
+          i += appendLongO(builder, "b", ch2);
+          break;
+        case 'パ':
+          builder.append("pa");
+          break;
+        case 'ピ':
+          i += appendIRow(builder, "py", "pi", ch2, ch3);
+          break;
+        case 'プ':
+          builder.append("pu");
+          break;
+        case 'ペ':
+          builder.append("pe");
+          break;
+        case 'ポ':
+          i += appendLongO(builder, "p", ch2);
+          break;
+        case 'ヴ':
+          if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("vye");
+            i += 2;
+          } else {
+            builder.append('v');
+          }
+          break;
+        case 'ァ':
+          builder.append('a');
+          break;
+        case 'ィ':
+          builder.append('i');
+          break;
+        case 'ゥ':
+          builder.append('u');
+          break;
+        case 'ェ':
+          builder.append('e');
+          break;
+        case 'ォ':
+          builder.append('o');
+          break;
+        case 'ヮ':
+          builder.append("wa");
+          break;
+        case 'ャ':
+          builder.append("ya");
+          break;
+        case 'ュ':
+          builder.append("yu");
+          break;
+        case 'ョ':
+          builder.append("yo");
+          break;
+        case 'ー':
+          // prolonged sound mark: produces no output
+          break;
+        default:
+          // no mapping: copy the character through unchanged
+          builder.append(ch);
+          break;
+      }
+    }
+    return builder.toString();
+  }
+
+  /**
+   * Appends the doubled consonant for a sokuon (ッ) followed by a k-, s-, t- or
+   * p-row kana. Nothing is appended before any other character. The following
+   * kana itself is not consumed.
+   */
+  private static void appendSokuon(StringBuilder builder, char next) {
+    switch (next) {
+      case 'カ':
+      case 'キ':
+      case 'ク':
+      case 'ケ':
+      case 'コ':
+        builder.append('k');
+        break;
+      case 'サ':
+      case 'シ':
+      case 'ス':
+      case 'セ':
+      case 'ソ':
+        builder.append('s');
+        break;
+      case 'タ':
+      case 'チ':
+      case 'ツ':
+      case 'テ':
+      case 'ト':
+        builder.append('t');
+        break;
+      case 'パ':
+      case 'ピ':
+      case 'プ':
+      case 'ペ':
+      case 'ポ':
+        builder.append('p');
+        break;
+      default:
+        break;
+    }
+  }
+
+  /**
+   * Appends the romanization of ン: "m" before b-, p- and m-row kana, "n'"
+   * before vowels and y-row kana, and "n" otherwise. The following kana itself
+   * is not consumed.
+   */
+  private static void appendMoraicN(StringBuilder builder, char next) {
+    switch (next) {
+      case 'バ':
+      case 'ビ':
+      case 'ブ':
+      case 'ベ':
+      case 'ボ':
+      case 'パ':
+      case 'ピ':
+      case 'プ':
+      case 'ペ':
+      case 'ポ':
+      case 'マ':
+      case 'ミ':
+      case 'ム':
+      case 'メ':
+      case 'モ':
+        builder.append('m');
+        break;
+      case 'ヤ':
+      case 'ユ':
+      case 'ヨ':
+      case 'ア':
+      case 'イ':
+      case 'ウ':
+      case 'エ':
+      case 'オ':
+        builder.append("n'");
+        break;
+      default:
+        builder.append("n");
+        break;
+    }
+  }
+
+  /**
+   * Appends an o-column kana: consonant + "ō" when it is followed by ウ,
+   * consonant + "o" otherwise.
+   *
+   * @return the number of following characters consumed (1 for the ウ, else 0)
+   */
+  private static int appendLongO(StringBuilder builder, String consonant, char next) {
+    if (next == 'ウ') {
+      builder.append(consonant).append('ō');
+      return 1;
+    }
+    builder.append(consonant).append('o');
+    return 0;
+  }
+
+  /**
+   * Appends an i-column kana that may combine with a following small ャ, ュ, ョ
+   * or ェ, where ョウ and ュウ yield the long vowels ō and ū.
+   *
+   * @param stem romanization prefix used for the combined forms, e.g. "ky"
+   * @param plain romanization of the kana on its own, e.g. "ki"
+   * @return the number of following characters consumed (0, 1 or 2)
+   */
+  private static int appendIRow(StringBuilder builder, String stem, String plain, char next, char afterNext) {
+    switch (next) {
+      case 'ョ':
+        if (afterNext == 'ウ') {
+          builder.append(stem).append('ō');
+          return 2;
+        }
+        builder.append(stem).append('o');
+        return 1;
+      case 'ュ':
+        if (afterNext == 'ウ') {
+          builder.append(stem).append('ū');
+          return 2;
+        }
+        builder.append(stem).append('u');
+        return 1;
+      case 'ャ':
+        builder.append(stem).append('a');
+        return 1;
+      case 'ェ':
+        builder.append(stem).append('e');
+        return 1;
+      default:
+        builder.append(plain);
+        return 0;
+    }
+  }
+
+  /**
+   * Appends ク or グ, which combine with a following small ァ, ィ, ェ, ォ or ヮ
+   * into kw-/gw- syllables.
+   *
+   * @param stem romanization prefix used for the combined forms, e.g. "kw"
+   * @param plain romanization of the kana on its own, e.g. "ku"
+   * @return the number of following characters consumed (0 or 1)
+   */
+  private static int appendLabialized(StringBuilder builder, String stem, String plain, char next) {
+    switch (next) {
+      case 'ァ':
+      case 'ヮ':
+        builder.append(stem).append('a');
+        return 1;
+      case 'ィ':
+        builder.append(stem).append('i');
+        return 1;
+      case 'ェ':
+        builder.append(stem).append('e');
+        return 1;
+      case 'ォ':
+        builder.append(stem).append('o');
+        return 1;
+      default:
+        builder.append(plain);
+        return 0;
+    }
+  }
+}

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1229660&r1=1229659&r2=1229660&view=diff
==============================================================================
Binary files - no diff available.