You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/22 13:21:48 UTC
svn commit: r1303746 - in
/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji:
KuromojiAnalyzer.java KuromojiTokenizer.java
Author: rmuir
Date: Thu Mar 22 12:21:48 2012
New Revision: 1303746
URL: http://svn.apache.org/viewvc?rev=1303746&view=rev
Log:
add some more kuromoji javadocs
Modified:
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1303746&r1=1303745&r2=1303746&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Thu Mar 22 12:21:48 2012
@@ -35,6 +35,7 @@ import org.apache.lucene.util.Version;
/**
* Analyzer for Japanese that uses morphological analysis.
+ * @see KuromojiTokenizer
*/
public class KuromojiAnalyzer extends StopwordAnalyzerBase {
private final Mode mode;
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1303746&r1=1303745&r2=1303746&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Thu Mar 22 12:21:48 2012
@@ -47,23 +47,57 @@ import org.apache.lucene.util.fst.FST;
// TODO: somehow factor out a reusable viterbi search here,
// so other decompounders/tokenizers can reuse...
-/* Uses a rolling Viterbi search to find the least cost
- * segmentation (path) of the incoming characters. For
- * tokens that appear to be compound (> length 2 for all
+/**
+ * Tokenizer for Japanese that uses morphological analysis.
+ * <p>
+ * This tokenizer sets a number of additional attributes:
+ * <ul>
+ * <li>{@link BaseFormAttribute} containing base form for inflected
+ * adjectives and verbs.
+ * <li>{@link PartOfSpeechAttribute} containing part-of-speech.
+ * <li>{@link ReadingAttribute} containing reading and pronunciation.
+ * <li>{@link InflectionAttribute} containing additional part-of-speech
+ * information for inflected forms.
+ * </ul>
+ * <p>
+ * This tokenizer uses a rolling Viterbi search to find the
+ * least cost segmentation (path) of the incoming characters.
+ * For tokens that appear to be compound (> length 2 for all
* Kanji, or > length 7 for non-Kanji), we see if there is a
* 2nd best segmentation of that token after applying
* penalties to the long tokens. If so, and the Mode is
- * SEARCH_WITH_COMPOUND, we output the alternate
- * segmentation as well. */
-/**
- * Tokenizer for Japanese that uses morphological analysis.
+ * {@link Mode#SEARCH}, we output the alternate segmentation
+ * as well.
*/
public final class KuromojiTokenizer extends Tokenizer {
+ /**
+ * Tokenization mode: this determines how the tokenizer handles
+ * compound and unknown words.
+ */
public static enum Mode {
- NORMAL, SEARCH, EXTENDED
+ /**
+ * Ordinary segmentation: no decomposition for compounds.
+ */
+ NORMAL,
+
+ /**
+ * Segmentation geared towards search: this includes a
+ * decompounding process for long nouns, also including
+ * the full compound token as a synonym.
+ */
+ SEARCH,
+
+ /**
+ * Extended mode outputs unigrams for unknown words.
+ * @lucene.experimental
+ */
+ EXTENDED
}
+ /**
+ * Default tokenization mode. Currently this is {@link Mode#SEARCH}.
+ */
public static final Mode DEFAULT_MODE = Mode.SEARCH;
enum Type {
@@ -139,6 +173,14 @@ public final class KuromojiTokenizer ext
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
+ /**
+ * Create a new KuromojiTokenizer.
+ *
+ * @param input Reader containing text
+ * @param userDictionary Optional user dictionary; may be null.
+ * @param discardPunctuation true if punctuation tokens should be dropped from the output.
+ * @param mode tokenization mode.
+ */
public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
super(input);
dictionary = TokenInfoDictionary.getInstance();