You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/12 21:10:52 UTC
svn commit: r1230748 [1/5] - in /lucene/dev/trunk: dev-tools/eclipse/
lucene/contrib/ modules/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/util/
modules/analysis/common/src/test/org/apache/lucene/analysis/util/
modules/analysis...
Author: rmuir
Date: Thu Jan 12 20:10:48 2012
New Revision: 1230748
URL: http://svn.apache.org/viewvc?rev=1230748&view=rev
Log:
LUCENE-3305: add Kuromoji Japanese morphological analyzer
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/build.xml (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/BaseFormAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/BaseFormAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/bocchan.utf-8 (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/util/TestToStringUtil.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/CharacterDefinitionWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/StringMockSolrResourceLoader.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestKuromojiBaseFormFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestKuromojiPartOfSpeechStopFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java (with props)
Modified:
lucene/dev/trunk/dev-tools/eclipse/dot.classpath
lucene/dev/trunk/lucene/contrib/contrib-build.xml
lucene/dev/trunk/modules/analysis/CHANGES.txt
lucene/dev/trunk/modules/analysis/NOTICE.txt
lucene/dev/trunk/modules/analysis/README.txt
lucene/dev/trunk/modules/analysis/build.xml
lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt
lucene/dev/trunk/solr/contrib/analysis-extras/README.txt
lucene/dev/trunk/solr/contrib/analysis-extras/build.xml
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
Modified: lucene/dev/trunk/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/eclipse/dot.classpath?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/trunk/dev-tools/eclipse/dot.classpath Thu Jan 12 20:10:48 2012
@@ -24,6 +24,9 @@
<classpathentry kind="src" path="modules/analysis/icu/src/java"/>
<classpathentry kind="src" path="modules/analysis/icu/src/resources"/>
<classpathentry kind="src" path="modules/analysis/icu/src/test"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/java"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/resources"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/test"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/java"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/test"/>
<classpathentry kind="src" path="modules/analysis/smartcn/src/java"/>
Modified: lucene/dev/trunk/lucene/contrib/contrib-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/contrib-build.xml?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/contrib-build.xml (original)
+++ lucene/dev/trunk/lucene/contrib/contrib-build.xml Thu Jan 12 20:10:48 2012
@@ -196,6 +196,17 @@
<property name="analyzers-stempel.uptodate" value="true"/>
</target>
+ <property name="analyzers-kuromoji.jar" value="${common.dir}/../modules/analysis/build/kuromoji/lucene-analyzers-kuromoji-${version}.jar"/>
+ <target name="check-analyzers-kuromoji-uptodate" unless="analyzers-kuromoji.uptodate">
+ <module-uptodate name="analysis/kuromoji" jarfile="${analyzers-kuromoji.jar}" property="analyzers-kuromoji.uptodate"/>
+ </target>
+ <target name="jar-analyzers-kuromoji" unless="analyzers-kuromoji.uptodate" depends="check-analyzers-kuromoji-uptodate">
+ <ant dir="${common.dir}/../modules/analysis/kuromoji" target="jar-core" inheritAll="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ </ant>
+ <property name="analyzers-kuromoji.uptodate" value="true"/>
+ </target>
+
<property name="grouping.jar" value="${common.dir}/../modules/grouping/build/lucene-grouping-${version}.jar"/>
<target name="check-grouping-uptodate" unless="grouping.uptodate">
<module-uptodate name="grouping" jarfile="${grouping.jar}" property="grouping.uptodate"/>
Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Thu Jan 12 20:10:48 2012
@@ -42,6 +42,11 @@ API Changes
since they prevent reuse. Both Analyzers should be configured at instantiation.
(Chris Male)
+ * LUCENE-3305: Added SegmentingTokenizerBase, which breaks text into sentences
+ with BreakIterator and allows subclasses to decompose sentences into words, or
+ use the sentence boundary information for other reasons (e.g. attribute/position increment)
+ (Robert Muir)
+
New Features
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
@@ -109,6 +114,9 @@ New Features
* LUCENE-3414: Added HunspellStemFilter which uses a provided pure Java implementation of the
Hunspell algorithm. (Chris Male)
+ * LUCENE-3305: Added Kuromoji morphological analyzer for Japanese.
+ (Christian Moen, Masaru Hasegawa, Simon Willnauer, Uwe Schindler, Robert Muir)
+
Build
* LUCENE-2413: All analyzers in contrib/analyzers and contrib/icu were moved to the
Modified: lucene/dev/trunk/modules/analysis/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/NOTICE.txt?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/NOTICE.txt (original)
+++ lucene/dev/trunk/modules/analysis/NOTICE.txt Thu Jan 12 20:10:48 2012
@@ -71,3 +71,86 @@ LGPL and Creative Commons ShareAlike.
Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)
+
+===========================================================================
+Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
+===========================================================================
+
+This software includes a binary and/or source version of data from
+
+ mecab-ipadic-2.7.0-20070801
+
+which can be obtained from
+
+ http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
+
+or
+
+ http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
+
+===========================================================================
+mecab-ipadic-2.7.0-20070801 Notice
+===========================================================================
+
+Nara Institute of Science and Technology (NAIST),
+the copyright holders, disclaims all warranties with regard to this
+software, including all implied warranties of merchantability and
+fitness, in no event shall NAIST be liable for
+any special, indirect or consequential damages or any damages
+whatsoever resulting from loss of use, data or profits, whether in an
+action of contract, negligence or other tortuous action, arising out
+of or in connection with the use or performance of this software.
+
+A large portion of the dictionary entries
+originate from ICOT Free Software. The following conditions for ICOT
+Free Software applies to the current dictionary as well.
+
+Each User may also freely distribute the Program, whether in its
+original form or modified, to any third party or parties, PROVIDED
+that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+on, or be attached to, the Program, which is distributed substantially
+in the same form as set out herein and that such intended
+distribution, if actually made, will neither violate or otherwise
+contravene any of the laws and regulations of the countries having
+jurisdiction over the User or the intended distribution itself.
+
+NO WARRANTY
+
+The program was produced on an experimental basis in the course of the
+research and development conducted during the project and is provided
+to users as so produced on an experimental basis. Accordingly, the
+program is provided without any warranty whatsoever, whether express,
+implied, statutory or otherwise. The term "warranty" used herein
+includes, but is not limited to, any warranty of the quality,
+performance, merchantability and fitness for a particular purpose of
+the program and the nonexistence of any infringement or violation of
+any right of any third party.
+
+Each user of the program will agree and understand, and be deemed to
+have agreed and understood, that there is no warranty whatsoever for
+the program and, accordingly, the entire risk arising from or
+otherwise connected with the program is assumed by the user.
+
+Therefore, neither ICOT, the copyright holder, or any other
+organization that participated in or was otherwise related to the
+development of the program and their respective officials, directors,
+officers and other employees shall be held liable for any and all
+damages, including, without limitation, general, special, incidental
+and consequential damages, arising out of or otherwise in connection
+with the use or inability to use the program or any product, material
+or result produced or otherwise obtained by using the program,
+regardless of whether they have been advised of, or otherwise had
+knowledge of, the possibility of such damages at any time during the
+project or thereafter. Each user will be deemed to have agreed to the
+foregoing by his or her commencement of use of the program. The term
+"use" as used herein includes, but is not limited to, the use,
+modification, copying and distribution of the program and the
+production of secondary products from the program.
+
+In the case where the program, whether in its original form or
+modified, was distributed or delivered to or received by a user from
+any person, organization or entity other than ICOT, unless it makes or
+grants independently of ICOT any specific warranty to the user in
+writing, such person, organization or entity, will also be exempted
+from and not be held liable to the user for any such damages as noted
+above as far as the program is concerned.
Modified: lucene/dev/trunk/modules/analysis/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/README.txt?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/README.txt (original)
+++ lucene/dev/trunk/modules/analysis/README.txt Thu Jan 12 20:10:48 2012
@@ -22,6 +22,12 @@ lucene-analyzers-icu-XX.jar
International Components for Unicode (ICU). Note: this module depends on
the ICU4j jar file (version >= 4.6.0)
+lucene-analyzers-kuromoji-XX.jar
+ An analyzer with morphological analysis for Japanese.
+
+lucene-analyzers-morfologik-XX.jar
+ An analyzer using the Morfologik stemming library.
+
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
@@ -35,21 +41,20 @@ lucene-analyzers-stempel-XX.jar
An add-on analysis library that contains a universal algorithmic stemmer,
including tables for the Polish language.
-lucene-analyzers-morfologik-XX.jar
- An analyzer using the Morfologik stemming library.
-
common/src/java
icu/src/java
+kuromoji/src/java
+morfologik/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
-morfologik/src/java
- The source code for the ffve libraries.
+ The source code for the libraries.
common/src/test
icu/src/test
+kuromoji/src/test
+morfologik/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
-morfologik/src/test
- Unit tests for the five libraries.
+ Unit tests for the libraries.
Modified: lucene/dev/trunk/modules/analysis/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/build.xml?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/build.xml (original)
+++ lucene/dev/trunk/modules/analysis/build.xml Thu Jan 12 20:10:48 2012
@@ -23,9 +23,10 @@
Additional Analyzers
- common: Additional Analyzers
- icu: Analyzers that use functionality from ICU
+ - kuromoji: Japanese Morphological Analyzer
+ - morfologik: Morfologik Stemmer
- smartcn: Smart Analyzer for Simplified Chinese Text
- stempel: Algorithmic Stemmer for Polish
- - morfologik: Morfologik Stemmer
</description>
<target name="common">
@@ -36,6 +37,14 @@
<ant dir="icu" />
</target>
+ <target name="kuromoji">
+ <ant dir="kuromoji" />
+ </target>
+
+ <target name="morfologik">
+ <ant dir="morfologik" />
+ </target>
+
<target name="phonetic">
<ant dir="phonetic" />
</target>
@@ -48,52 +57,53 @@
<ant dir="stempel" />
</target>
- <target name="morfologik">
- <ant dir="morfologik" />
- </target>
-
<target name="default" depends="compile"/>
- <target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
+ <target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel" />
<target name="clean">
<ant dir="common" target="clean" />
<ant dir="icu" target="clean" />
+ <ant dir="kuromoji" target="clean"/>
+ <ant dir="morfologik" target="clean" />
<ant dir="phonetic" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
- <ant dir="morfologik" target="clean" />
</target>
<target name="validate">
<ant dir="common" target="validate" />
<ant dir="icu" target="validate" />
+ <ant dir="kuromoji" target="validate" />
+ <ant dir="morfologik" target="validate" />
<ant dir="phonetic" target="validate" />
<ant dir="smartcn" target="validate" />
<ant dir="stempel" target="validate" />
- <ant dir="morfologik" target="validate" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
<ant dir="icu" target="compile-core" />
+ <ant dir="kuromoji" target="compile-core" />
+ <ant dir="morfologik" target="compile-core" />
<ant dir="phonetic" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
- <ant dir="morfologik" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
<ant dir="icu" target="compile-test" />
+ <ant dir="kuromoji" target="compile-test" />
+ <ant dir="morfologik" target="compile-test" />
<ant dir="phonetic" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
- <ant dir="morfologik" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
<ant dir="icu" target="test" />
+ <ant dir="kuromoji" target="test" />
+ <ant dir="morfologik" target="test" />
<ant dir="phonetic" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
- <ant dir="morfologik" target="test" />
</target>
<target name="build-artifacts-and-tests" depends="default,compile-test" />
@@ -101,28 +111,31 @@
<target name="dist-maven" depends="default,javadocs">
<ant dir="common" target="dist-maven" />
<ant dir="icu" target="dist-maven" />
+ <ant dir="kuromoji" target="dist-maven" />
+ <ant dir="morfologik" target="dist-maven" />
<ant dir="phonetic" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
- <ant dir="morfologik" target="dist-maven" />
</target>
<target name="javadocs">
<ant dir="common" target="javadocs" />
<ant dir="icu" target="javadocs" />
+ <ant dir="kuromoji" target="javadocs" />
+ <ant dir="morfologik" target="javadocs" />
<ant dir="phonetic" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
- <ant dir="morfologik" target="javadocs" />
</target>
<target name="javadocs-index.html">
<ant dir="common" target="javadocs-index.html" />
<ant dir="icu" target="javadocs-index.html" />
+ <ant dir="kuromoji" target="javadocs-index.html" />
+ <ant dir="morfologik" target="javadocs-index.html" />
<ant dir="phonetic" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
- <ant dir="morfologik" target="javadocs-index.html" />
</target>
</project>
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import java.text.BreakIterator;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Breaks text into sentences with a {@link BreakIterator} and
+ * allows subclasses to decompose these sentences into words.
+ * <p>
+ * This can be used by subclasses that need sentence context
+ * for tokenization purposes, such as CJK segmenters.
+ * <p>
+ * Additionally it can be used by subclasses that want to mark
+ * sentence boundaries (with a custom attribute, extra token, position
+ * increment, etc) for downstream processing.
+ *
+ * @lucene.experimental
+ */
+public abstract class SegmentingTokenizerBase extends Tokenizer {
+ protected static final int BUFFERMAX = 4096;
+ protected final char buffer[] = new char[BUFFERMAX];
+ /** true length of text in the buffer */
+ private int length = 0;
+ /** length in buffer that can be evaluated safely, up to a safe end point */
+ private int usableLength = 0;
+ /** accumulated offset of previous buffers for this reader, for offsetAtt */
+ protected int offset = 0;
+
+ private final BreakIterator iterator;
+ private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();
+
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ /**
+ * Construct a new SegmentingTokenizerBase from the given Reader, using
+ * the provided BreakIterator for sentence segmentation.
+ * <p>
+ * Note that you should never share BreakIterators across different
+ * TokenStreams, instead a newly created or cloned one should always
+ * be provided to this constructor.
+ */
+ public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
+ super(input);
+ this.iterator = iterator;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (length == 0 || !incrementWord()) {
+ while (!incrementSentence()) {
+ refill();
+ if (length <= 0) // no more bytes to read;
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ wrapper.setText(buffer, 0, 0);
+ iterator.setText(wrapper);
+ length = usableLength = offset = 0;
+ }
+
+ @Override
+ public void reset(Reader input) throws IOException {
+ this.input = input;
+ reset();
+ }
+
+ @Override
+ public final void end() throws IOException {
+ final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ /** Returns the last unambiguous break position in the text. */
+ private int findSafeEnd() {
+ for (int i = length - 1; i >= 0; i--)
+ if (isSafeEnd(buffer[i]))
+ return i + 1;
+ return -1;
+ }
+
+ /** For sentence tokenization, these are the unambiguous break positions. */
+ protected boolean isSafeEnd(char ch) {
+ switch(ch) {
+ case 0x000D:
+ case 0x000A:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * Refill the buffer, accumulating the offset and setting usableLength to the
+ * last unambiguous break position
+ */
+ private void refill() throws IOException {
+ offset += usableLength;
+ int leftover = length - usableLength;
+ System.arraycopy(buffer, usableLength, buffer, 0, leftover);
+ int requested = buffer.length - leftover;
+ int returned = input.read(buffer, leftover, requested);
+ length = returned < 0 ? leftover : returned + leftover;
+ if (returned < requested) /* reader has been emptied, process the rest */
+ usableLength = length;
+ else { /* still more data to be read, find a safe-stopping place */
+ usableLength = findSafeEnd();
+ if (usableLength < 0)
+ usableLength = length; /*
+ * more than BUFFERMAX of text without breaks,
+ * gonna possibly truncate tokens
+ */
+ }
+
+ wrapper.setText(buffer, 0, Math.max(0, usableLength));
+ iterator.setText(wrapper);
+ }
+
+ /**
+ * return true if there is a token from the buffer, or false if it is
+ * exhausted.
+ */
+ private boolean incrementSentence() throws IOException {
+ if (length == 0) // we must refill the buffer
+ return false;
+
+ while (true) {
+ int start = iterator.current();
+
+ if (start == BreakIterator.DONE)
+ return false; // BreakIterator exhausted
+
+ // find the next set of boundaries
+ int end = iterator.next();
+
+ if (end == BreakIterator.DONE)
+ return false; // BreakIterator exhausted
+
+ setNextSentence(start, end);
+ if (incrementWord()) {
+ return true;
+ }
+ }
+ }
+
+ /** Provides the next input sentence for analysis */
+ protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);
+ /** Returns true if another word is available */
+ protected abstract boolean incrementWord();
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,224 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.text.BreakIterator;
+import java.util.Arrays;
+import java.util.Locale;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/** Basic tests for {@link SegmentingTokenizerBase} */
+public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
+ private Analyzer sentence = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WholeSentenceTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ private Analyzer sentenceAndWord = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new SentenceAndWordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ /** Some simple examples, just outputting the whole sentence boundaries as "terms" */
+ public void testBasics() throws IOException {
+ assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
+ new String[] { "The acronym for United States is U.S. but this doesn't end a sentence"}
+ );
+ assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
+ new String[] { "He said, \"Are you going?\" ",
+ "John shook his head." }
+ );
+ }
+
+ /** Test a subclass that sets some custom attribute values */
+ public void testCustomAttributes() throws IOException {
+ assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
+ new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
+ new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
+ new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
+ new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 }
+ );
+ }
+
+ /** Tests tokenstream reuse */
+ public void testReuse() throws IOException {
+ assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"",
+ new String[] { "He", "said", "Are", "you", "going" },
+ new int[] { 0, 3, 10, 14, 18 },
+ new int[] { 2, 7, 13, 17, 23 },
+ new int[] { 1, 1, 1, 1, 1,}
+ );
+ assertAnalyzesToReuse(sentenceAndWord, "John shook his head.",
+ new String[] { "John", "shook", "his", "head" },
+ new int[] { 0, 5, 11, 15 },
+ new int[] { 4, 10, 14, 19 },
+ new int[] { 1, 1, 1, 1 }
+ );
+ }
+
+ /** Tests TokenStream.end() */
+ public void testEnd() throws IOException {
+ // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
+ // we add some junk whitespace to the end just to test it.
+ assertAnalyzesTo(sentenceAndWord, "John shook his head ",
+ new String[] { "John", "shook", "his", "head" }
+ );
+ assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
+ new String[] { "John", "shook", "his", "head" }
+ );
+ }
+
+ /** Tests terms which span across boundaries */
+ public void testHugeDoc() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ char whitespace[] = new char[4094];
+ Arrays.fill(whitespace, '\n');
+ sb.append(whitespace);
+ sb.append("testing 1234");
+ String input = sb.toString();
+ assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
+ }
+
+ /** Tests the handling of binary/malformed data */
+ public void testHugeTerm() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 40960; i++) {
+ sb.append('a');
+ }
+ String input = sb.toString();
+ char token[] = new char[4096];
+ Arrays.fill(token, 'a');
+ String expectedToken = new String(token);
+ String expected[] = {
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken
+ };
+ assertAnalyzesTo(sentence, input, expected);
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, sentence, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, sentenceAndWord, 10000*RANDOM_MULTIPLIER);
+ }
+
+ // some tokenizers for testing
+
+ /** silly tokenizer that just returns whole sentences as tokens */
+ static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
+ int sentenceStart, sentenceEnd;
+ boolean hasSentence;
+
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ public WholeSentenceTokenizer(Reader input) {
+ super(input, BreakIterator.getSentenceInstance(new Locale("")));
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ hasSentence = true;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ if (hasSentence) {
+ hasSentence = false;
+ clearAttributes();
+ termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
+ offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ /**
+ * simple tokenizer, that bumps posinc + 1 for tokens after a
+ * sentence boundary to inhibit phrase queries without slop.
+ */
+ static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
+ int sentenceStart, sentenceEnd;
+ int wordStart, wordEnd;
+ int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost
+
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ public SentenceAndWordTokenizer(Reader input) {
+ super(input, BreakIterator.getSentenceInstance(new Locale("")));
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ posBoost++;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ posBoost = -1;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ wordStart = wordEnd;
+ while (wordStart < sentenceEnd) {
+ if (Character.isLetterOrDigit(buffer[wordStart]))
+ break;
+ wordStart++;
+ }
+
+ if (wordStart == sentenceEnd) return false;
+
+ wordEnd = wordStart+1;
+ while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd]))
+ wordEnd++;
+
+ clearAttributes();
+ termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
+ offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+ posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
+ posBoost = 0;
+ return true;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/build.xml?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/build.xml (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/build.xml Thu Jan 12 20:10:48 2012
@@ -0,0 +1,121 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="analyzers-kuromoji" default="default">
+
+ <description>
+ Kuromoji Japanese Morphological Analyzer
+ </description>
+
+ <property name="build.dir" location="../build/kuromoji" />
+ <property name="dist.dir" location="../dist/kuromoji" />
+ <property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
+ <property name="dict.src.file" value="${ipadic.version}.tar.gz" />
+ <!-- <property name="dict.url" value="http://atilika.com/releases/mecab-ipadic/${dict.src.file}" /> -->
+ <property name="dict.url" value="http://mecab.googlecode.com/files/${dict.src.file}"/>
+ <property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
+ <property name="dict.encoding" value="euc-jp"/>
+ <property name="dict.format" value="ipadic"/>
+ <property name="dict.normalize" value="false"/>
+ <property name="dict.target.dir" location="./src/resources"/>
+ <import file="../../../lucene/contrib/contrib-build.xml"/>
+
+ <available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
+
+ <path id="classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <path refid="base.classpath"/>
+ </path>
+
+ <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
+ <target name="download-dict" unless="dict.available">
+ <get src="${dict.url}" dest="${build.dir}/${dict.src.file}"/>
+ <gunzip src="${build.dir}/${dict.src.file}"/>
+ <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
+ </target>
+
+ <path id="tools.dependencies">
+ <fileset dir="../icu/lib" includes="icu4j-*.jar"/>
+ </path>
+
+ <path id="tools.classpath">
+ <path refid="classpath"/>
+ <path refid="tools.dependencies"/>
+ <pathelement location="${build.dir}/classes/java"/>
+ <pathelement location="${build.dir}/classes/tools"/>
+ </path>
+
+ <path id="tools.test.classpath">
+ <path refid="tools.classpath"/>
+ <path refid="test.base.classpath"/>
+ <pathelement location="${build.dir}/classes/tools-test"/>
+ </path>
+
+ <target name="build-dict" depends="compile-tools, download-dict">
+ <sequential>
+ <delete verbose="true">
+ <fileset dir="src/resources/org/apache/lucene/analysis/kuromoji/dict" includes="**/*"/>
+ </delete>
+ <java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
+ <classpath>
+ <path refid="tools.classpath"/>
+ <pathelement path="${build.dir}/classes/tools"/>
+ </classpath>
+ <assertions>
+ <enable package="org.apache.lucene"/>
+ </assertions>
+ <arg value="${dict.format}"/>
+ <arg value="${dict.src.dir}"/>
+ <arg value="${dict.target.dir}"/>
+ <arg value="${dict.encoding}"/>
+ <arg value="${dict.normalize}"/>
+ </java>
+ </sequential>
+ </target>
+
+ <target name="compile-tools" depends="compile-core, common.compile-tools">
+ <compile
+ srcdir="src/tools/java"
+ destdir="${build.dir}/classes/tools">
+ <classpath>
+ <path refid="tools.classpath"/>
+ <pathelement path="src/tools/java"/>
+ </classpath>
+ </compile>
+ </target>
+
+ <target name="compile-tools-tests" depends="compile-tools">
+ <compile
+ srcdir="src/tools/test"
+ destdir="${build.dir}/classes/tools-test">
+ <classpath>
+ <path refid="tools.test.classpath"/>
+ <pathelement path="src/tools/test"/>
+ </classpath>
+ </compile>
+ </target>
+
+ <target name="test-tools" depends="compile-tools-tests">
+ <test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
+ </target>
+
+ <target name="compile-test" depends="contrib-build.compile-test, compile-tools-tests"/>
+ <target name="test" depends="contrib-build.test, test-tools"/>
+
+</project>
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,91 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
+
+public class KuromojiAnalyzer extends StopwordAnalyzerBase {
+ private final Segmenter segmenter;
+ private final Set<String> stoptags;
+
+ public KuromojiAnalyzer(Version matchVersion) {
+ this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+ }
+
+ public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) {
+ super(matchVersion, stopwords);
+ this.segmenter = segmenter;
+ this.stoptags = stoptags;
+ }
+
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ public static Set<String> getDefaultStopTags(){
+ return DefaultSetHolder.DEFAULT_STOP_TAGS;
+ }
+
+ /**
+ * Atomically loads DEFAULT_STOP_SET, DEFAULT_STOP_TAGS in a lazy fashion once the
+ * outer class accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+ static final Set<String> DEFAULT_STOP_TAGS;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#");
+ final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
+ DEFAULT_STOP_TAGS = new HashSet<String>();
+ for (Object element : tagset) {
+ char chars[] = (char[]) element;
+ DEFAULT_STOP_TAGS.add(new String(chars));
+ }
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
+ TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
+ stream = new CJKWidthFilter(stream);
+ stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
+ stream = new StopFilter(matchVersion, stream, stopwords);
+ stream = new KuromojiBaseFormFilter(stream);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * Replaces term text with the {@link BaseFormAttribute}.
+ * <p>
+ * This acts as a lemmatizer for verbs and adjectives.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class KuromojiBaseFormFilter extends TokenFilter {
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
+ private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+ public KuromojiBaseFormFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAtt.isKeyword()) {
+ String baseForm = basicFormAtt.getBaseForm();
+ if (baseForm != null) {
+ termAtt.setEmpty().append(basicFormAtt.getBaseForm());
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Removes tokens that match a set of POS tags.
+ */
+public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
+ private final Set<String> stopTags;
+ private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+
+ public KuromojiPartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
+ super(enablePositionIncrements, input);
+ this.stopTags = stopTags;
+ }
+
+ @Override
+ protected boolean accept() throws IOException {
+ final String pos = posAtt.getPartOfSpeech();
+ return pos == null || !stopTags.contains(pos);
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,83 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.text.BreakIterator;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
+
+public final class KuromojiTokenizer extends SegmentingTokenizerBase {
+ private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
+ private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+ private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
+ private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
+ private final Segmenter segmenter;
+
+ private List<Token> tokens;
+ private int tokenIndex = 0;
+ private int sentenceStart = 0;
+
+ public KuromojiTokenizer(Reader input) {
+ this(new Segmenter(), input);
+ }
+
+ public KuromojiTokenizer(Segmenter segmenter, Reader input) {
+ super(input, (BreakIterator) proto.clone());
+ this.segmenter = segmenter;
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.sentenceStart = sentenceStart;
+ // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
+ tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true);
+ tokenIndex = 0;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ if (tokenIndex == tokens.size()) {
+ return false;
+ }
+ Token token = tokens.get(tokenIndex);
+ int position = token.getPosition();
+ int length = token.getLength();
+ clearAttributes();
+ termAtt.copyBuffer(buffer, sentenceStart + position, length);
+ int startOffset = offset + sentenceStart + position;
+ offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
+ basicFormAtt.setToken(token);
+ posAtt.setToken(token);
+ readingAtt.setToken(token);
+ inflectionAtt.setToken(token);
+ tokenIndex++;
+ return true;
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,214 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.dict.*;
+import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
+import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Tokenizer main class.
+ * Thread safe.
+ */
+public class Segmenter {
+ /** Segmentation mode, passed through to the Viterbi searcher. */
+ public static enum Mode {
+ NORMAL, SEARCH, EXTENDED
+ }
+
+ /** Builds and searches the lattice; does the actual segmentation work. */
+ private final Viterbi viterbi;
+
+ /** Maps each node type to the dictionary its word ids resolve against. */
+ private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
+
+ /** When true, {@link #tokenize(String)} pre-splits input at sentence punctuation. */
+ private final boolean split;
+
+ public Segmenter() {
+ this(null, Mode.NORMAL, false);
+ }
+
+ public Segmenter(UserDictionary userDictionary, Mode mode) {
+ this(userDictionary, mode, false);
+ }
+
+ public Segmenter(UserDictionary userDictionary) {
+ this(userDictionary, Mode.NORMAL, false);
+ }
+
+ public Segmenter(Mode mode) {
+ this(null, mode, false);
+ }
+
+ public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
+
+ final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
+ final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
+ this.viterbi = new Viterbi(dict,
+ unknownDict,
+ ConnectionCosts.getInstance(),
+ userDictionary,
+ mode);
+
+ this.split = split;
+
+ dictionaryMap.put(Type.KNOWN, dict);
+ dictionaryMap.put(Type.UNKNOWN, unknownDict);
+ dictionaryMap.put(Type.USER, userDictionary);
+ }
+
+ /**
+ * Tokenize input text
+ * @param text text to tokenize
+ * @return list of Token
+ */
+ public List<Token> tokenize(String text) {
+
+ if (!split) {
+ return doTokenize(0, text);
+ }
+
+ List<Integer> splitPositions = getSplitPositions(text);
+
+ if(splitPositions.size() == 0) {
+ return doTokenize(0, text);
+ }
+
+ // Tokenize each sentence separately, carrying the absolute offset along
+ // so token positions remain relative to the original text.
+ ArrayList<Token> result = new ArrayList<Token>();
+ int offset = 0;
+ for(int position : splitPositions) {
+ result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
+ offset = position + 1;
+ }
+
+ if(offset < text.length()) {
+ result.addAll(doTokenize(offset, text.substring(offset)));
+ }
+
+ return result;
+ }
+
+ /**
+ * Split input text at Japanese sentence punctuation (the ideographic
+ * full stop and comma).
+ * @param text text to scan
+ * @return list of split positions
+ */
+ private List<Integer> getSplitPositions(String text) {
+ ArrayList<Integer> splitPositions = new ArrayList<Integer>();
+
+ int position = 0;
+ int currentPosition = 0;
+
+ while(true) {
+ int indexOfMaru = text.indexOf("ã", currentPosition);
+ int indexOfTen = text.indexOf("ã", currentPosition);
+
+ // If one delimiter is absent (-1), max() yields the other (or -1 if
+ // both are absent); otherwise take the nearer of the two.
+ if(indexOfMaru < 0 || indexOfTen < 0) {
+ position = Math.max(indexOfMaru, indexOfTen);
+ } else {
+ position = Math.min(indexOfMaru, indexOfTen);
+ }
+
+ if(position >= 0) {
+ splitPositions.add(position);
+ currentPosition = position + 1;
+ } else {
+ break;
+ }
+ }
+
+ return splitPositions;
+ }
+
+ private List<Token> doTokenize(int offset, String sentence) {
+ char text[] = sentence.toCharArray();
+ return doTokenize(offset, text, 0, text.length, false);
+ }
+
+ /**
+ * Tokenize input sentence.
+ * @param offset offset of sentence in original input text
+ * @param sentence sentence to tokenize
+ * @param sentenceOffset offset into the sentence array
+ * @param sentenceLength number of characters to tokenize
+ * @param discardPunctuation true to drop tokens that start with punctuation
+ * @return list of Token
+ */
+ public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
+ ArrayList<Token> result = new ArrayList<Token>();
+
+ ViterbiNode[][][] lattice;
+ try {
+ lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
+ } catch (IOException impossible) {
+ // lookups are against in-memory data, so this cannot actually happen
+ throw new RuntimeException(impossible);
+ }
+ List<ViterbiNode> bestPath = viterbi.search(lattice);
+ for (ViterbiNode node : bestPath) {
+ int wordId = node.getWordId();
+ if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS
+ continue;
+ } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
+ continue; // Do not emit punctuation
+ }
+ Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
+ result.add(token);
+ }
+
+ return result;
+ }
+
+ /** returns a Graphviz String */
+ public String debugTokenize(String text) {
+ ViterbiNode[][][] lattice;
+ try {
+ lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
+ } catch (IOException impossible) {
+ throw new RuntimeException(impossible);
+ }
+ List<ViterbiNode> bestPath = this.viterbi.search(lattice);
+
+ return new GraphvizFormatter(ConnectionCosts.getInstance())
+ .format(lattice[0], lattice[1], bestPath);
+ }
+
+ /** True for separator, control/format, punctuation and symbol categories. */
+ static final boolean isPunctuation(char ch) {
+ switch(Character.getType(ch)) {
+ case Character.SPACE_SEPARATOR:
+ case Character.LINE_SEPARATOR:
+ case Character.PARAGRAPH_SEPARATOR:
+ case Character.CONTROL:
+ case Character.FORMAT:
+ case Character.DASH_PUNCTUATION:
+ case Character.START_PUNCTUATION:
+ case Character.END_PUNCTUATION:
+ case Character.CONNECTOR_PUNCTUATION:
+ case Character.OTHER_PUNCTUATION:
+ case Character.MATH_SYMBOL:
+ case Character.CURRENCY_SYMBOL:
+ case Character.MODIFIER_SYMBOL:
+ case Character.OTHER_SYMBOL:
+ case Character.INITIAL_QUOTE_PUNCTUATION:
+ case Character.FINAL_QUOTE_PUNCTUATION:
+ return true;
+ default:
+ return false;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,147 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Analyzed token. Morphological metadata (reading, part of speech,
+ * inflection, base form) is looked up on demand from the {@link Dictionary}
+ * that produced this token.
+ */
+public class Token {
+ // dictionary that wordId refers to; metadata getters delegate to it
+ private final Dictionary dictionary;
+
+ private final int wordId;
+
+ // surface form is the slice [offset, offset+length) of this shared char[]
+ private final char[] surfaceForm;
+ private final int offset;
+ private final int length;
+
+ // index of this token in the original input text
+ private final int position;
+
+ // KNOWN, UNKNOWN or USER: which kind of dictionary matched this token
+ private final Type type;
+
+ public Token(int wordId, char[] surfaceForm, int offset, int length, Type type, int position, Dictionary dictionary) {
+ this.wordId = wordId;
+ this.surfaceForm = surfaceForm;
+ this.offset = offset;
+ this.length = length;
+ this.type = type;
+ this.position = position;
+ this.dictionary = dictionary;
+ }
+
+ /**
+ * @return surfaceForm
+ */
+ public char[] getSurfaceForm() {
+ return surfaceForm;
+ }
+
+ /**
+ * @return offset into surfaceForm
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * @return length of surfaceForm
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * @return surfaceForm as a String
+ */
+ public String getSurfaceFormString() {
+ return new String(surfaceForm, offset, length);
+ }
+
+ /**
+ * @return reading. null if token doesn't have reading.
+ */
+ public String getReading() {
+ return dictionary.getReading(wordId);
+ }
+
+ /**
+ * @return pronunciation. null if token doesn't have pronunciation.
+ */
+ public String getPronunciation() {
+ return dictionary.getPronunciation(wordId);
+ }
+
+ /**
+ * @return part of speech.
+ */
+ public String getPartOfSpeech() {
+ return dictionary.getPartOfSpeech(wordId);
+ }
+
+ /**
+ * @return inflection type or null
+ */
+ public String getInflectionType() {
+ return dictionary.getInflectionType(wordId);
+ }
+
+ /**
+ * @return inflection form or null
+ */
+ public String getInflectionForm() {
+ return dictionary.getInflectionForm(wordId);
+ }
+
+ /**
+ * @return base form or null if token is not inflected
+ */
+ public String getBaseForm() {
+ return dictionary.getBaseForm(wordId);
+ }
+
+ /**
+ * Returns true if this token is known word
+ * @return true if this token is in standard dictionary. false if not.
+ */
+ public boolean isKnown() {
+ return type == Type.KNOWN;
+ }
+
+ /**
+ * Returns true if this token is unknown word
+ * @return true if this token is unknown word. false if not.
+ */
+ public boolean isUnknown() {
+ return type == Type.UNKNOWN;
+ }
+
+ /**
+ * Returns true if this token is defined in user dictionary
+ * @return true if this token is in user dictionary. false if not.
+ */
+ public boolean isUser() {
+ return type == Type.USER;
+ }
+
+ /**
+ * Get index of this token in input text
+ * @return position of token
+ */
+ public int getPosition() {
+ return position;
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,291 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Base class for dictionaries backed by pre-built binary resources loaded
+ * from the classpath: a raw entry buffer, a sourceId-to-wordIds map, a
+ * part-of-speech string table and an inflection type/form table.
+ */
+public abstract class BinaryDictionary implements Dictionary {
+
+ // Resource name suffixes for the four data files backing a dictionary.
+ public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
+ public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
+ public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
+ public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
+
+ // Codec headers and version, validated when each data file is loaded.
+ public static final String DICT_HEADER = "kuromoji_dict";
+ public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
+ public static final String POSDICT_HEADER = "kuromoji_dict_pos";
+ public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
+ public static final int VERSION = 1;
+
+ // Raw dictionary entries; a wordId is a byte offset into this buffer.
+ private final ByteBuffer buffer;
+ // targetMap holds delta-decoded word ids; targetMapOffsets[sourceId] is the
+ // start of sourceId's slice, with one extra sentinel entry at the end.
+ private final int[] targetMapOffsets, targetMap;
+ // Interned part-of-speech strings, indexed by each entry's pos index.
+ private final String[] posDict;
+ // Parallel tables: inflTypeDict[i] and inflFormDict[i] belong together.
+ private final String[] inflTypeDict;
+ private final String[] inflFormDict;
+
+ /**
+ * Loads the four data files via {@link #getResource(String)}. Any
+ * IOException is captured and rethrown after all streams are closed.
+ */
+ protected BinaryDictionary() throws IOException {
+ InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
+ IOException priorE = null;
+ int[] targetMapOffsets = null, targetMap = null;
+ String[] posDict = null;
+ String[] inflFormDict = null;
+ String[] inflTypeDict = null;
+ ByteBuffer buffer = null;
+ try {
+ // 1) target map: VInt-delta encoded word ids; the low bit of each value
+ // marks the start of a new sourceId's run, the high bits are the delta.
+ mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
+ mapIS = new BufferedInputStream(mapIS);
+ DataInput in = new InputStreamDataInput(mapIS);
+ CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
+ targetMap = new int[in.readVInt()];
+ targetMapOffsets = new int[in.readVInt()];
+ int accum = 0, sourceId = 0;
+ for (int ofs = 0; ofs < targetMap.length; ofs++) {
+ final int val = in.readVInt();
+ if ((val & 0x01) != 0) {
+ targetMapOffsets[sourceId] = ofs;
+ sourceId++;
+ }
+ accum += val >>> 1;
+ targetMap[ofs] = accum;
+ }
+ // the last slot of targetMapOffsets is reserved for the end sentinel
+ if (sourceId + 1 != targetMapOffsets.length)
+ throw new IOException("targetMap file format broken");
+ targetMapOffsets[sourceId] = targetMap.length;
+ mapIS.close(); mapIS = null;
+
+ // 2) part-of-speech string table
+ posIS = getResource(POSDICT_FILENAME_SUFFIX);
+ posIS = new BufferedInputStream(posIS);
+ in = new InputStreamDataInput(posIS);
+ CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
+ posDict = new String[in.readVInt()];
+ for (int j = 0; j < posDict.length; j++) {
+ posDict[j] = in.readString();
+ }
+ posIS.close(); posIS = null;
+
+ // 3) inflection table: pairs of (type, form) strings sharing an index
+ inflIS = getResource(INFLDICT_FILENAME_SUFFIX);
+ inflIS = new BufferedInputStream(inflIS);
+ in = new InputStreamDataInput(inflIS);
+ CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
+ int length = in.readVInt();
+ inflTypeDict = new String[length];
+ inflFormDict = new String[length];
+ for (int j = 0; j < length; j++) {
+ inflTypeDict[j] = in.readString();
+ inflFormDict[j] = in.readString();
+ }
+ inflIS.close(); inflIS = null;
+
+ // 4) main entry buffer, read in bulk into a direct (off-heap) buffer
+ dictIS = getResource(DICT_FILENAME_SUFFIX);
+ // no buffering here, as we load in one large buffer
+ in = new InputStreamDataInput(dictIS);
+ CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
+ final int size = in.readVInt();
+ final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
+ final ReadableByteChannel channel = Channels.newChannel(dictIS);
+ // NOTE(review): ReadableByteChannel.read is not guaranteed to fill the
+ // buffer in a single call; a partial read would spuriously trigger the
+ // EOFException below -- TODO consider looping until size bytes are read.
+ final int read = channel.read(tmpBuffer);
+ if (read != size) {
+ throw new EOFException("Cannot read whole dictionary");
+ }
+ dictIS.close(); dictIS = null;
+ buffer = tmpBuffer.asReadOnlyBuffer();
+ } catch (IOException ioe) {
+ priorE = ioe;
+ } finally {
+ // closes any still-open streams, then rethrows priorE if it was set
+ IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
+ }
+
+ this.targetMap = targetMap;
+ this.targetMapOffsets = targetMapOffsets;
+ this.posDict = posDict;
+ this.inflTypeDict = inflTypeDict;
+ this.inflFormDict = inflFormDict;
+ this.buffer = buffer;
+ }
+
+ /** Opens the data file for this concrete dictionary class. */
+ protected final InputStream getResource(String suffix) throws IOException {
+ return getClassResource(getClass(), suffix);
+ }
+
+ // util, reused by ConnectionCosts and CharacterDefinition
+ public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
+ final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
+ if (is == null)
+ throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
+ return is;
+ }
+
+ /** Sets {@code ref} to the slice of word ids mapped from {@code sourceId}. */
+ public void lookupWordIds(int sourceId, IntsRef ref) {
+ ref.ints = targetMap;
+ ref.offset = targetMapOffsets[sourceId];
+ // targetMapOffsets always has one more entry pointing behind last:
+ ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
+ }
+
+ // Entry layout at byte offset wordId: short leftId (+0), short rightId (+2),
+ // short wordCost (+4), byte posData (+6), then variable-length baseForm,
+ // reading, optional pronunciation and optional inflection index.
+ @Override
+ public int getLeftId(int wordId) {
+ return buffer.getShort(wordId);
+ }
+
+ @Override
+ public int getRightId(int wordId) {
+ return buffer.getShort(wordId + 2); // Skip left id
+ }
+
+ @Override
+ public int getWordCost(int wordId) {
+ return buffer.getShort(wordId + 4); // Skip left id and right id
+ }
+
+ @Override
+ public String getBaseForm(int wordId) {
+ // length byte's high 7 bits are the char count; 0 means "same as surface"
+ int offset = baseFormOffset(wordId);
+ int length = (buffer.get(offset++) & 0xff) >>> 1;
+ if (length == 0) {
+ return null; // same as surface form
+ } else {
+ return readString(offset, length, false);
+ }
+ }
+
+ @Override
+ public String getReading(int wordId) {
+ // length byte: high 7 bits = length, low bit = kana-compressed flag
+ int offset = readingOffset(wordId);
+ int readingData = buffer.get(offset++) & 0xff;
+ return readString(offset, readingData >>> 1, (readingData & 1) == 1);
+ }
+
+ @Override
+ public String getPartOfSpeech(int wordId) {
+ int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
+ return posDict[posIndex >>> 1];
+ }
+
+ @Override
+ public String getPronunciation(int wordId) {
+ if (hasPronunciationData(wordId)) {
+ int offset = pronunciationOffset(wordId);
+ int pronunciationData = buffer.get(offset++) & 0xff;
+ return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
+ } else {
+ return getReading(wordId); // same as the reading
+ }
+ }
+
+ @Override
+ public String getInflectionType(int wordId) {
+ int index = getInflectionIndex(wordId);
+ return index < 0 ? null : inflTypeDict[index];
+ }
+
+ @Override
+ public String getInflectionForm(int wordId) {
+ int index = getInflectionIndex(wordId);
+ return index < 0 ? null : inflFormDict[index];
+ }
+
+ private static int posOffset(int wordId) {
+ return wordId + 6;
+ }
+
+ private static int baseFormOffset(int wordId) {
+ return wordId + 7;
+ }
+
+ // Reading data starts right after the base form's length byte and chars.
+ private int readingOffset(int wordId) {
+ int offset = baseFormOffset(wordId);
+ int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
+ return offset + baseFormLength;
+ }
+
+ // Pronunciation data starts after the reading; the reading's byte size
+ // depends on whether it is kana-compressed (1 byte/char) or UTF-16.
+ private int pronunciationOffset(int wordId) {
+ int offset = readingOffset(wordId);
+ int readingData = buffer.get(offset++) & 0xff;
+ final int readingLength;
+ if ((readingData & 1) == 0) {
+ readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+ } else {
+ readingLength = readingData >>> 1;
+ }
+ return offset + readingLength;
+ }
+
+ // Low bit of the base form length byte: 0 = separate pronunciation stored.
+ private boolean hasPronunciationData(int wordId) {
+ int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
+ return (baseFormData & 1) == 0;
+ }
+
+ // Low bit of the pos byte: 1 = inflection index follows the entry data.
+ private boolean hasInflectionData(int wordId) {
+ int posData = buffer.get(posOffset(wordId)) & 0xff;
+ return (posData & 1) == 1;
+ }
+
+ /** Decodes the VInt-like inflection table index, or -1 if none stored. */
+ private int getInflectionIndex(int wordId) {
+ if (!hasInflectionData(wordId)) {
+ return -1; // common case: no inflection data
+ }
+
+ // skip past reading/pronunciation at the end
+ int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
+ int endData = buffer.get(offset++) & 0xff;
+
+ final int endLength;
+ if ((endData & 1) == 0) {
+ endLength = endData & 0xfe; // UTF-16: mask off kana bit
+ } else {
+ endLength = endData >>> 1;
+ }
+
+ offset += endLength;
+
+ // 1-2 byte little-endian varint: high bit of each byte is the continuation flag
+ byte b = buffer.get(offset++);
+ int i = b & 0x7F;
+ if ((b & 0x80) == 0) return i;
+ b = buffer.get(offset++);
+ i |= (b & 0x7F) << 7;
+ assert ((b & 0x80) == 0);
+ return i;
+ }
+
+ /**
+ * Decodes a stored string: kana-compressed strings store one byte per char
+ * as an offset from U+30A0 (katakana block); otherwise chars are UTF-16.
+ */
+ private String readString(int offset, int length, boolean kana) {
+ char text[] = new char[length];
+ if (kana) {
+ for (int i = 0; i < length; i++) {
+ text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
+ }
+ } else {
+ for (int i = 0; i < length; i++) {
+ text[i] = buffer.getChar(offset + (i << 1));
+ }
+ }
+ return new String(text);
+ }
+}