You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/12 21:10:52 UTC
svn commit: r1230748 [1/5] - in /lucene/dev/trunk: dev-tools/eclipse/
lucene/contrib/ modules/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/util/
modules/analysis/common/src/test/org/apache/lucene/analysis/util/
modules/analysis...
Author: rmuir
Date: Thu Jan 12 20:10:48 2012
New Revision: 1230748
URL: http://svn.apache.org/viewvc?rev=1230748&view=rev
Log:
LUCENE-3305: add Kuromoji Japanese morphological analyzer
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/build.xml (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/BaseFormAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/BaseFormAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/InflectionAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/PartOfSpeechAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttribute.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenattributes/ReadingAttributeImpl.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stoptags.txt (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/stopwords.txt (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/bocchan.utf-8 (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/util/TestToStringUtil.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/CharacterDefinitionWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java (with props)
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/trunk/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/StringMockSolrResourceLoader.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestKuromojiBaseFormFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestKuromojiPartOfSpeechStopFilterFactory.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java (with props)
Modified:
lucene/dev/trunk/dev-tools/eclipse/dot.classpath
lucene/dev/trunk/lucene/contrib/contrib-build.xml
lucene/dev/trunk/modules/analysis/CHANGES.txt
lucene/dev/trunk/modules/analysis/NOTICE.txt
lucene/dev/trunk/modules/analysis/README.txt
lucene/dev/trunk/modules/analysis/build.xml
lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt
lucene/dev/trunk/solr/contrib/analysis-extras/README.txt
lucene/dev/trunk/solr/contrib/analysis-extras/build.xml
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
Modified: lucene/dev/trunk/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/dev-tools/eclipse/dot.classpath?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/trunk/dev-tools/eclipse/dot.classpath Thu Jan 12 20:10:48 2012
@@ -24,6 +24,9 @@
<classpathentry kind="src" path="modules/analysis/icu/src/java"/>
<classpathentry kind="src" path="modules/analysis/icu/src/resources"/>
<classpathentry kind="src" path="modules/analysis/icu/src/test"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/java"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/resources"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/test"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/java"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/test"/>
<classpathentry kind="src" path="modules/analysis/smartcn/src/java"/>
Modified: lucene/dev/trunk/lucene/contrib/contrib-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/contrib-build.xml?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/contrib-build.xml (original)
+++ lucene/dev/trunk/lucene/contrib/contrib-build.xml Thu Jan 12 20:10:48 2012
@@ -196,6 +196,17 @@
<property name="analyzers-stempel.uptodate" value="true"/>
</target>
+ <property name="analyzers-kuromoji.jar" value="${common.dir}/../modules/analysis/build/kuromoji/lucene-analyzers-kuromoji-${version}.jar"/>
+ <target name="check-analyzers-kuromoji-uptodate" unless="analyzers-kuromoji.uptodate">
+ <module-uptodate name="analysis/kuromoji" jarfile="${analyzers-kuromoji.jar}" property="analyzers-kuromoji.uptodate"/>
+ </target>
+ <target name="jar-analyzers-kuromoji" unless="analyzers-kuromoji.uptodate" depends="check-analyzers-kuromoji-uptodate">
+ <ant dir="${common.dir}/../modules/analysis/kuromoji" target="jar-core" inheritAll="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ </ant>
+ <property name="analyzers-kuromoji.uptodate" value="true"/>
+ </target>
+
<property name="grouping.jar" value="${common.dir}/../modules/grouping/build/lucene-grouping-${version}.jar"/>
<target name="check-grouping-uptodate" unless="grouping.uptodate">
<module-uptodate name="grouping" jarfile="${grouping.jar}" property="grouping.uptodate"/>
Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Thu Jan 12 20:10:48 2012
@@ -42,6 +42,11 @@ API Changes
since they prevent reuse. Both Analyzers should be configured at instantiation.
(Chris Male)
+ * LUCENE-3305: Added SegmentingTokenizerBase, which breaks text into sentences
+ with BreakIterator and allows subclasses to decompose sentences into words, or
+ use the sentence boundary information for other reasons (e.g. attribute/position increment)
+ (Robert Muir)
+
New Features
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
@@ -109,6 +114,9 @@ New Features
* LUCENE-3414: Added HunspellStemFilter which uses a provided pure Java implementation of the
Hunspell algorithm. (Chris Male)
+ * LUCENE-3305: Added Kuromoji morphological analyzer for Japanese.
+ (Christian Moen, Masaru Hasegawa, Simon Willnauer, Uwe Schindler, Robert Muir)
+
Build
* LUCENE-2413: All analyzers in contrib/analyzers and contrib/icu were moved to the
Modified: lucene/dev/trunk/modules/analysis/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/NOTICE.txt?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/NOTICE.txt (original)
+++ lucene/dev/trunk/modules/analysis/NOTICE.txt Thu Jan 12 20:10:48 2012
@@ -71,3 +71,86 @@ LGPL and Creative Commons ShareAlike.
Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)
+
+===========================================================================
+Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
+===========================================================================
+
+This software includes a binary and/or source version of data from
+
+ mecab-ipadic-2.7.0-20070801
+
+which can be obtained from
+
+ http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
+
+or
+
+ http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
+
+===========================================================================
+mecab-ipadic-2.7.0-20070801 Notice
+===========================================================================
+
+Nara Institute of Science and Technology (NAIST),
+the copyright holders, disclaims all warranties with regard to this
+software, including all implied warranties of merchantability and
+fitness, in no event shall NAIST be liable for
+any special, indirect or consequential damages or any damages
+whatsoever resulting from loss of use, data or profits, whether in an
+action of contract, negligence or other tortuous action, arising out
+of or in connection with the use or performance of this software.
+
+A large portion of the dictionary entries
+originate from ICOT Free Software. The following conditions for ICOT
+Free Software applies to the current dictionary as well.
+
+Each User may also freely distribute the Program, whether in its
+original form or modified, to any third party or parties, PROVIDED
+that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+on, or be attached to, the Program, which is distributed substantially
+in the same form as set out herein and that such intended
+distribution, if actually made, will neither violate or otherwise
+contravene any of the laws and regulations of the countries having
+jurisdiction over the User or the intended distribution itself.
+
+NO WARRANTY
+
+The program was produced on an experimental basis in the course of the
+research and development conducted during the project and is provided
+to users as so produced on an experimental basis. Accordingly, the
+program is provided without any warranty whatsoever, whether express,
+implied, statutory or otherwise. The term "warranty" used herein
+includes, but is not limited to, any warranty of the quality,
+performance, merchantability and fitness for a particular purpose of
+the program and the nonexistence of any infringement or violation of
+any right of any third party.
+
+Each user of the program will agree and understand, and be deemed to
+have agreed and understood, that there is no warranty whatsoever for
+the program and, accordingly, the entire risk arising from or
+otherwise connected with the program is assumed by the user.
+
+Therefore, neither ICOT, the copyright holder, or any other
+organization that participated in or was otherwise related to the
+development of the program and their respective officials, directors,
+officers and other employees shall be held liable for any and all
+damages, including, without limitation, general, special, incidental
+and consequential damages, arising out of or otherwise in connection
+with the use or inability to use the program or any product, material
+or result produced or otherwise obtained by using the program,
+regardless of whether they have been advised of, or otherwise had
+knowledge of, the possibility of such damages at any time during the
+project or thereafter. Each user will be deemed to have agreed to the
+foregoing by his or her commencement of use of the program. The term
+"use" as used herein includes, but is not limited to, the use,
+modification, copying and distribution of the program and the
+production of secondary products from the program.
+
+In the case where the program, whether in its original form or
+modified, was distributed or delivered to or received by a user from
+any person, organization or entity other than ICOT, unless it makes or
+grants independently of ICOT any specific warranty to the user in
+writing, such person, organization or entity, will also be exempted
+from and not be held liable to the user for any such damages as noted
+above as far as the program is concerned.
Modified: lucene/dev/trunk/modules/analysis/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/README.txt?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/README.txt (original)
+++ lucene/dev/trunk/modules/analysis/README.txt Thu Jan 12 20:10:48 2012
@@ -22,6 +22,12 @@ lucene-analyzers-icu-XX.jar
International Components for Unicode (ICU). Note: this module depends on
the ICU4j jar file (version >= 4.6.0)
+lucene-analyzers-kuromoji-XX.jar
+ An analyzer with morphological analysis for Japanese.
+
+lucene-analyzers-morfologik-XX.jar
+ An analyzer using the Morfologik stemming library.
+
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
@@ -35,21 +41,20 @@ lucene-analyzers-stempel-XX.jar
An add-on analysis library that contains a universal algorithmic stemmer,
including tables for the Polish language.
-lucene-analyzers-morfologik-XX.jar
- An analyzer using the Morfologik stemming library.
-
common/src/java
icu/src/java
+kuromoji/src/java
+morfologik/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
-morfologik/src/java
- The source code for the ffve libraries.
+ The source code for the libraries.
common/src/test
icu/src/test
+kuromoji/src/test
+morfologik/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
-morfologik/src/test
- Unit tests for the five libraries.
+ Unit tests for the libraries.
Modified: lucene/dev/trunk/modules/analysis/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/build.xml?rev=1230748&r1=1230747&r2=1230748&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/build.xml (original)
+++ lucene/dev/trunk/modules/analysis/build.xml Thu Jan 12 20:10:48 2012
@@ -23,9 +23,10 @@
Additional Analyzers
- common: Additional Analyzers
- icu: Analyzers that use functionality from ICU
+ - kuromoji: Japanese Morphological Analyzer
+ - morfologik: Morfologik Stemmer
- smartcn: Smart Analyzer for Simplified Chinese Text
- stempel: Algorithmic Stemmer for Polish
- - morfologik: Morfologik Stemmer
</description>
<target name="common">
@@ -36,6 +37,14 @@
<ant dir="icu" />
</target>
+ <target name="kuromoji">
+ <ant dir="kuromoji" />
+ </target>
+
+ <target name="morfologik">
+ <ant dir="morfologik" />
+ </target>
+
<target name="phonetic">
<ant dir="phonetic" />
</target>
@@ -48,52 +57,53 @@
<ant dir="stempel" />
</target>
- <target name="morfologik">
- <ant dir="morfologik" />
- </target>
-
<target name="default" depends="compile"/>
- <target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
+ <target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel" />
<target name="clean">
<ant dir="common" target="clean" />
<ant dir="icu" target="clean" />
+ <ant dir="kuromoji" target="clean"/>
+ <ant dir="morfologik" target="clean" />
<ant dir="phonetic" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
- <ant dir="morfologik" target="clean" />
</target>
<target name="validate">
<ant dir="common" target="validate" />
<ant dir="icu" target="validate" />
+ <ant dir="kuromoji" target="validate" />
+ <ant dir="morfologik" target="validate" />
<ant dir="phonetic" target="validate" />
<ant dir="smartcn" target="validate" />
<ant dir="stempel" target="validate" />
- <ant dir="morfologik" target="validate" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
<ant dir="icu" target="compile-core" />
+ <ant dir="kuromoji" target="compile-core" />
+ <ant dir="morfologik" target="compile-core" />
<ant dir="phonetic" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
- <ant dir="morfologik" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
<ant dir="icu" target="compile-test" />
+ <ant dir="kuromoji" target="compile-test" />
+ <ant dir="morfologik" target="compile-test" />
<ant dir="phonetic" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
- <ant dir="morfologik" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
<ant dir="icu" target="test" />
+ <ant dir="kuromoji" target="test" />
+ <ant dir="morfologik" target="test" />
<ant dir="phonetic" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
- <ant dir="morfologik" target="test" />
</target>
<target name="build-artifacts-and-tests" depends="default,compile-test" />
@@ -101,28 +111,31 @@
<target name="dist-maven" depends="default,javadocs">
<ant dir="common" target="dist-maven" />
<ant dir="icu" target="dist-maven" />
+ <ant dir="kuromoji" target="dist-maven" />
+ <ant dir="morfologik" target="dist-maven" />
<ant dir="phonetic" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
- <ant dir="morfologik" target="dist-maven" />
</target>
<target name="javadocs">
<ant dir="common" target="javadocs" />
<ant dir="icu" target="javadocs" />
+ <ant dir="kuromoji" target="javadocs" />
+ <ant dir="morfologik" target="javadocs" />
<ant dir="phonetic" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
- <ant dir="morfologik" target="javadocs" />
</target>
<target name="javadocs-index.html">
<ant dir="common" target="javadocs-index.html" />
<ant dir="icu" target="javadocs-index.html" />
+ <ant dir="kuromoji" target="javadocs-index.html" />
+ <ant dir="morfologik" target="javadocs-index.html" />
<ant dir="phonetic" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
- <ant dir="morfologik" target="javadocs-index.html" />
</target>
</project>
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import java.text.BreakIterator;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Breaks text into sentences with a {@link BreakIterator} and
+ * allows subclasses to decompose these sentences into words.
+ * <p>
+ * This can be used by subclasses that need sentence context
+ * for tokenization purposes, such as CJK segmenters.
+ * <p>
+ * Additionally it can be used by subclasses that want to mark
+ * sentence boundaries (with a custom attribute, extra token, position
+ * increment, etc) for downstream processing.
+ *
+ * @lucene.experimental
+ */
+public abstract class SegmentingTokenizerBase extends Tokenizer {
+ protected static final int BUFFERMAX = 4096;
+ protected final char buffer[] = new char[BUFFERMAX];
+ /** true length of text in the buffer */
+ private int length = 0;
+ /** length in buffer that can be evaluated safely, up to a safe end point */
+ private int usableLength = 0;
+ /** accumulated offset of previous buffers for this reader, for offsetAtt */
+ protected int offset = 0;
+
+ private final BreakIterator iterator;
+ private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();
+
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ /**
+ * Construct a new SegmentingTokenizerBase from the given Reader, using
+ * the provided BreakIterator for sentence segmentation.
+ * <p>
+ * Note that you should never share BreakIterators across different
+ * TokenStreams, instead a newly created or cloned one should always
+ * be provided to this constructor.
+ */
+ public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
+ super(input);
+ this.iterator = iterator;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (length == 0 || !incrementWord()) {
+ while (!incrementSentence()) {
+ refill();
+ if (length <= 0) // no more bytes to read;
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ wrapper.setText(buffer, 0, 0);
+ iterator.setText(wrapper);
+ length = usableLength = offset = 0;
+ }
+
+ @Override
+ public void reset(Reader input) throws IOException {
+ this.input = input;
+ reset();
+ }
+
+ @Override
+ public final void end() throws IOException {
+ final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ /** Returns the last unambiguous break position in the text. */
+ private int findSafeEnd() {
+ for (int i = length - 1; i >= 0; i--)
+ if (isSafeEnd(buffer[i]))
+ return i + 1;
+ return -1;
+ }
+
+ /** For sentence tokenization, these are the unambiguous break positions. */
+ protected boolean isSafeEnd(char ch) {
+ switch(ch) {
+ case 0x000D:
+ case 0x000A:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * Refill the buffer, accumulating the offset and setting usableLength to the
+ * last unambiguous break position
+ */
+ private void refill() throws IOException {
+ offset += usableLength;
+ int leftover = length - usableLength;
+ System.arraycopy(buffer, usableLength, buffer, 0, leftover);
+ int requested = buffer.length - leftover;
+ int returned = input.read(buffer, leftover, requested);
+ length = returned < 0 ? leftover : returned + leftover;
+ if (returned < requested) /* reader has been emptied, process the rest */
+ usableLength = length;
+ else { /* still more data to be read, find a safe-stopping place */
+ usableLength = findSafeEnd();
+ if (usableLength < 0)
+ usableLength = length; /*
+ * more than BUFFERMAX of text without breaks,
+ * gonna possibly truncate tokens
+ */
+ }
+
+ wrapper.setText(buffer, 0, Math.max(0, usableLength));
+ iterator.setText(wrapper);
+ }
+
+ /**
+ * return true if there is a token from the buffer, or false if it is
+ * exhausted.
+ */
+ private boolean incrementSentence() throws IOException {
+ if (length == 0) // we must refill the buffer
+ return false;
+
+ while (true) {
+ int start = iterator.current();
+
+ if (start == BreakIterator.DONE)
+ return false; // BreakIterator exhausted
+
+ // find the next set of boundaries
+ int end = iterator.next();
+
+ if (end == BreakIterator.DONE)
+ return false; // BreakIterator exhausted
+
+ setNextSentence(start, end);
+ if (incrementWord()) {
+ return true;
+ }
+ }
+ }
+
+ /** Provides the next input sentence for analysis */
+ protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);
+ /** Returns true if another word is available */
+ protected abstract boolean incrementWord();
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,224 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.text.BreakIterator;
+import java.util.Arrays;
+import java.util.Locale;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/** Basic tests for {@link SegmentingTokenizerBase} */
+public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
+ private Analyzer sentence = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WholeSentenceTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ private Analyzer sentenceAndWord = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new SentenceAndWordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ /** Some simple examples, just outputting the whole sentence boundaries as "terms" */
+ public void testBasics() throws IOException {
+ assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
+ new String[] { "The acronym for United States is U.S. but this doesn't end a sentence"}
+ );
+ assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
+ new String[] { "He said, \"Are you going?\" ",
+ "John shook his head." }
+ );
+ }
+
+ /** Test a subclass that sets some custom attribute values */
+ public void testCustomAttributes() throws IOException {
+ assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
+ new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
+ new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
+ new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
+ new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 }
+ );
+ }
+
+ /** Tests tokenstream reuse */
+ public void testReuse() throws IOException {
+ assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"",
+ new String[] { "He", "said", "Are", "you", "going" },
+ new int[] { 0, 3, 10, 14, 18 },
+ new int[] { 2, 7, 13, 17, 23 },
+ new int[] { 1, 1, 1, 1, 1,}
+ );
+ assertAnalyzesToReuse(sentenceAndWord, "John shook his head.",
+ new String[] { "John", "shook", "his", "head" },
+ new int[] { 0, 5, 11, 15 },
+ new int[] { 4, 10, 14, 19 },
+ new int[] { 1, 1, 1, 1 }
+ );
+ }
+
+ /** Tests TokenStream.end() */
+ public void testEnd() throws IOException {
+ // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
+ // we add some junk whitespace to the end just to test it.
+ assertAnalyzesTo(sentenceAndWord, "John shook his head ",
+ new String[] { "John", "shook", "his", "head" }
+ );
+ assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
+ new String[] { "John", "shook", "his", "head" }
+ );
+ }
+
+ /** Tests terms which span across boundaries */
+ public void testHugeDoc() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ char whitespace[] = new char[4094];
+ Arrays.fill(whitespace, '\n');
+ sb.append(whitespace);
+ sb.append("testing 1234");
+ String input = sb.toString();
+ assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
+ }
+
+ /** Tests the handling of binary/malformed data */
+ public void testHugeTerm() throws IOException {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 40960; i++) {
+ sb.append('a');
+ }
+ String input = sb.toString();
+ char token[] = new char[4096];
+ Arrays.fill(token, 'a');
+ String expectedToken = new String(token);
+ String expected[] = {
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken, expectedToken, expectedToken,
+ expectedToken
+ };
+ assertAnalyzesTo(sentence, input, expected);
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random, sentence, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, sentenceAndWord, 10000*RANDOM_MULTIPLIER);
+ }
+
+ // some tokenizers for testing
+
+ /** silly tokenizer that just returns whole sentences as tokens */
+ static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
+ int sentenceStart, sentenceEnd;
+ boolean hasSentence;
+
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ public WholeSentenceTokenizer(Reader input) {
+ super(input, BreakIterator.getSentenceInstance(new Locale("")));
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ hasSentence = true;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ if (hasSentence) {
+ hasSentence = false;
+ clearAttributes();
+ termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
+ offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ /**
+ * simple tokenizer, that bumps posinc + 1 for tokens after a
+ * sentence boundary to inhibit phrase queries without slop.
+ */
+ static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
+ int sentenceStart, sentenceEnd;
+ int wordStart, wordEnd;
+ int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost
+
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ public SentenceAndWordTokenizer(Reader input) {
+ super(input, BreakIterator.getSentenceInstance(new Locale("")));
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ posBoost++;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ posBoost = -1;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ wordStart = wordEnd;
+ while (wordStart < sentenceEnd) {
+ if (Character.isLetterOrDigit(buffer[wordStart]))
+ break;
+ wordStart++;
+ }
+
+ if (wordStart == sentenceEnd) return false;
+
+ wordEnd = wordStart+1;
+ while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd]))
+ wordEnd++;
+
+ clearAttributes();
+ termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
+ offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+ posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
+ posBoost = 0;
+ return true;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/build.xml?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/build.xml (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/build.xml Thu Jan 12 20:10:48 2012
@@ -0,0 +1,121 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="analyzers-kuromoji" default="default">
+
+ <description>
+ Kuromoji Japanese Morphological Analyzer
+ </description>
+
+ <property name="build.dir" location="../build/kuromoji" />
+ <property name="dist.dir" location="../dist/kuromoji" />
+ <property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
+ <property name="dict.src.file" value="${ipadic.version}.tar.gz" />
+ <!-- <property name="dict.url" value="http://atilika.com/releases/mecab-ipadic/${dict.src.file}" /> -->
+ <property name="dict.url" value="http://mecab.googlecode.com/files/${dict.src.file}"/>
+ <property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
+ <property name="dict.encoding" value="euc-jp"/>
+ <property name="dict.format" value="ipadic"/>
+ <property name="dict.normalize" value="false"/>
+ <property name="dict.target.dir" location="./src/resources"/>
+ <import file="../../../lucene/contrib/contrib-build.xml"/>
+
+ <available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
+
+ <path id="classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <path refid="base.classpath"/>
+ </path>
+
+ <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
+ <target name="download-dict" unless="dict.available">
+ <get src="${dict.url}" dest="${build.dir}/${dict.src.file}"/>
+ <gunzip src="${build.dir}/${dict.src.file}"/>
+ <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
+ </target>
+
+ <path id="tools.dependencies">
+ <fileset dir="../icu/lib" includes="icu4j-*.jar"/>
+ </path>
+
+ <path id="tools.classpath">
+ <path refid="classpath"/>
+ <path refid="tools.dependencies"/>
+ <pathelement location="${build.dir}/classes/java"/>
+ <pathelement location="${build.dir}/classes/tools"/>
+ </path>
+
+ <path id="tools.test.classpath">
+ <path refid="tools.classpath"/>
+ <path refid="test.base.classpath"/>
+ <pathelement location="${build.dir}/classes/tools-test"/>
+ </path>
+
+ <target name="build-dict" depends="compile-tools, download-dict">
+ <sequential>
+ <delete verbose="true">
+ <fileset dir="src/resources/org/apache/lucene/analysis/kuromoji/dict" includes="**/*"/>
+ </delete>
+ <java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
+ <classpath>
+ <path refid="tools.classpath"/>
+ <pathelement path="${build.dir}/classes/tools"/>
+ </classpath>
+ <assertions>
+ <enable package="org.apache.lucene"/>
+ </assertions>
+ <arg value="${dict.format}"/>
+ <arg value="${dict.src.dir}"/>
+ <arg value="${dict.target.dir}"/>
+ <arg value="${dict.encoding}"/>
+ <arg value="${dict.normalize}"/>
+ </java>
+ </sequential>
+ </target>
+
+ <target name="compile-tools" depends="compile-core, common.compile-tools">
+ <compile
+ srcdir="src/tools/java"
+ destdir="${build.dir}/classes/tools">
+ <classpath>
+ <path refid="tools.classpath"/>
+ <pathelement path="src/tools/java"/>
+ </classpath>
+ </compile>
+ </target>
+
+ <target name="compile-tools-tests" depends="compile-tools">
+ <compile
+ srcdir="src/tools/test"
+ destdir="${build.dir}/classes/tools-test">
+ <classpath>
+ <path refid="tools.test.classpath"/>
+ <pathelement path="src/tools/test"/>
+ </classpath>
+ </compile>
+ </target>
+
+ <target name="test-tools" depends="compile-tools-tests">
+ <test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
+ </target>
+
+ <target name="compile-test" depends="contrib-build.compile-test, compile-tools-tests"/>
+ <target name="test" depends="contrib-build.test, test-tools"/>
+
+</project>
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,91 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
+
+public class KuromojiAnalyzer extends StopwordAnalyzerBase {
+ private final Segmenter segmenter;
+ private final Set<String> stoptags;
+
+ public KuromojiAnalyzer(Version matchVersion) {
+ this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+ }
+
+ public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) {
+ super(matchVersion, stopwords);
+ this.segmenter = segmenter;
+ this.stoptags = stoptags;
+ }
+
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ public static Set<String> getDefaultStopTags(){
+ return DefaultSetHolder.DEFAULT_STOP_TAGS;
+ }
+
+ /**
+ * Atomically loads DEFAULT_STOP_SET, DEFAULT_STOP_TAGS in a lazy fashion once the
+ * outer class accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+ static final Set<String> DEFAULT_STOP_TAGS;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#");
+ final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
+ DEFAULT_STOP_TAGS = new HashSet<String>();
+ for (Object element : tagset) {
+ char chars[] = (char[]) element;
+ DEFAULT_STOP_TAGS.add(new String(chars));
+ }
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
+ TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
+ stream = new CJKWidthFilter(stream);
+ stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
+ stream = new StopFilter(matchVersion, stream, stopwords);
+ stream = new KuromojiBaseFormFilter(stream);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiBaseFormFilter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * Replaces term text with the {@link BaseFormAttribute}.
+ * <p>
+ * This acts as a lemmatizer for verbs and adjectives.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class KuromojiBaseFormFilter extends TokenFilter {
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
+ private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+ public KuromojiBaseFormFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAtt.isKeyword()) {
+ String baseForm = basicFormAtt.getBaseForm();
+ if (baseForm != null) {
+ termAtt.setEmpty().append(basicFormAtt.getBaseForm());
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiPartOfSpeechStopFilter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Removes tokens that match a set of POS tags.
+ */
+public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
+ private final Set<String> stopTags;
+ private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+
+ public KuromojiPartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
+ super(enablePositionIncrements, input);
+ this.stopTags = stopTags;
+ }
+
+ @Override
+ protected boolean accept() throws IOException {
+ final String pos = posAtt.getPartOfSpeech();
+ return pos == null || !stopTags.contains(pos);
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,83 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.text.BreakIterator;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
+
+public final class KuromojiTokenizer extends SegmentingTokenizerBase {
+ private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
+ private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+ private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
+ private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
+ private final Segmenter segmenter;
+
+ private List<Token> tokens;
+ private int tokenIndex = 0;
+ private int sentenceStart = 0;
+
+ public KuromojiTokenizer(Reader input) {
+ this(new Segmenter(), input);
+ }
+
+ public KuromojiTokenizer(Segmenter segmenter, Reader input) {
+ super(input, (BreakIterator) proto.clone());
+ this.segmenter = segmenter;
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.sentenceStart = sentenceStart;
+ // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
+ tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true);
+ tokenIndex = 0;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ if (tokenIndex == tokens.size()) {
+ return false;
+ }
+ Token token = tokens.get(tokenIndex);
+ int position = token.getPosition();
+ int length = token.getLength();
+ clearAttributes();
+ termAtt.copyBuffer(buffer, sentenceStart + position, length);
+ int startOffset = offset + sentenceStart + position;
+ offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
+ basicFormAtt.setToken(token);
+ posAtt.setToken(token);
+ readingAtt.setToken(token);
+ inflectionAtt.setToken(token);
+ tokenIndex++;
+ return true;
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,214 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.dict.*;
+import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
+import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Tokenizer main class.
+ * Thread safe.
+ */
+public class Segmenter {
+ /** Segmentation mode, passed through to the Viterbi searcher. */
+ public static enum Mode {
+ NORMAL, SEARCH, EXTENDED
+ }
+
+ /** Builds and searches the lattice; does the actual segmentation work. */
+ private final Viterbi viterbi;
+
+ /** Maps each node type to the dictionary its word ids resolve against. */
+ private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
+
+ /** When true, {@link #tokenize(String)} pre-splits input at sentence punctuation. */
+ private final boolean split;
+
+ public Segmenter() {
+ this(null, Mode.NORMAL, false);
+ }
+
+ public Segmenter(UserDictionary userDictionary, Mode mode) {
+ this(userDictionary, mode, false);
+ }
+
+ public Segmenter(UserDictionary userDictionary) {
+ this(userDictionary, Mode.NORMAL, false);
+ }
+
+ public Segmenter(Mode mode) {
+ this(null, mode, false);
+ }
+
+ public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
+
+ final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
+ final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
+ this.viterbi = new Viterbi(dict,
+ unknownDict,
+ ConnectionCosts.getInstance(),
+ userDictionary,
+ mode);
+
+ this.split = split;
+
+ dictionaryMap.put(Type.KNOWN, dict);
+ dictionaryMap.put(Type.UNKNOWN, unknownDict);
+ dictionaryMap.put(Type.USER, userDictionary);
+ }
+
+ /**
+ * Tokenize input text
+ * @param text text to tokenize
+ * @return list of Token
+ */
+ public List<Token> tokenize(String text) {
+
+ if (!split) {
+ return doTokenize(0, text);
+ }
+
+ List<Integer> splitPositions = getSplitPositions(text);
+
+ if(splitPositions.size() == 0) {
+ return doTokenize(0, text);
+ }
+
+ // Tokenize each sentence separately, carrying the absolute offset along
+ // so token positions remain relative to the original text.
+ ArrayList<Token> result = new ArrayList<Token>();
+ int offset = 0;
+ for(int position : splitPositions) {
+ result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
+ offset = position + 1;
+ }
+
+ if(offset < text.length()) {
+ result.addAll(doTokenize(offset, text.substring(offset)));
+ }
+
+ return result;
+ }
+
+ /**
+ * Split input text at Japanese sentence punctuation (the ideographic
+ * full stop and comma).
+ * @param text text to scan
+ * @return list of split positions
+ */
+ private List<Integer> getSplitPositions(String text) {
+ ArrayList<Integer> splitPositions = new ArrayList<Integer>();
+
+ int position = 0;
+ int currentPosition = 0;
+
+ while(true) {
+ int indexOfMaru = text.indexOf("ã", currentPosition);
+ int indexOfTen = text.indexOf("ã", currentPosition);
+
+ // If one delimiter is absent (-1), max() yields the other (or -1 if
+ // both are absent); otherwise take the nearer of the two.
+ if(indexOfMaru < 0 || indexOfTen < 0) {
+ position = Math.max(indexOfMaru, indexOfTen);
+ } else {
+ position = Math.min(indexOfMaru, indexOfTen);
+ }
+
+ if(position >= 0) {
+ splitPositions.add(position);
+ currentPosition = position + 1;
+ } else {
+ break;
+ }
+ }
+
+ return splitPositions;
+ }
+
+ private List<Token> doTokenize(int offset, String sentence) {
+ char text[] = sentence.toCharArray();
+ return doTokenize(offset, text, 0, text.length, false);
+ }
+
+ /**
+ * Tokenize input sentence.
+ * @param offset offset of sentence in original input text
+ * @param sentence sentence to tokenize
+ * @param sentenceOffset offset into the sentence array
+ * @param sentenceLength number of characters to tokenize
+ * @param discardPunctuation true to drop tokens that start with punctuation
+ * @return list of Token
+ */
+ public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
+ ArrayList<Token> result = new ArrayList<Token>();
+
+ ViterbiNode[][][] lattice;
+ try {
+ lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
+ } catch (IOException impossible) {
+ // lookups are against in-memory data, so this cannot actually happen
+ throw new RuntimeException(impossible);
+ }
+ List<ViterbiNode> bestPath = viterbi.search(lattice);
+ for (ViterbiNode node : bestPath) {
+ int wordId = node.getWordId();
+ if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS
+ continue;
+ } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
+ continue; // Do not emit punctuation
+ }
+ Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
+ result.add(token);
+ }
+
+ return result;
+ }
+
+ /** returns a Graphviz String */
+ public String debugTokenize(String text) {
+ ViterbiNode[][][] lattice;
+ try {
+ lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
+ } catch (IOException impossible) {
+ throw new RuntimeException(impossible);
+ }
+ List<ViterbiNode> bestPath = this.viterbi.search(lattice);
+
+ return new GraphvizFormatter(ConnectionCosts.getInstance())
+ .format(lattice[0], lattice[1], bestPath);
+ }
+
+ /** True for separator, control/format, punctuation and symbol categories. */
+ static final boolean isPunctuation(char ch) {
+ switch(Character.getType(ch)) {
+ case Character.SPACE_SEPARATOR:
+ case Character.LINE_SEPARATOR:
+ case Character.PARAGRAPH_SEPARATOR:
+ case Character.CONTROL:
+ case Character.FORMAT:
+ case Character.DASH_PUNCTUATION:
+ case Character.START_PUNCTUATION:
+ case Character.END_PUNCTUATION:
+ case Character.CONNECTOR_PUNCTUATION:
+ case Character.OTHER_PUNCTUATION:
+ case Character.MATH_SYMBOL:
+ case Character.CURRENCY_SYMBOL:
+ case Character.MODIFIER_SYMBOL:
+ case Character.OTHER_SYMBOL:
+ case Character.INITIAL_QUOTE_PUNCTUATION:
+ case Character.FINAL_QUOTE_PUNCTUATION:
+ return true;
+ default:
+ return false;
+ }
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,147 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Analyzed token. Morphological metadata (reading, part of speech,
+ * inflection, base form) is looked up on demand from the {@link Dictionary}
+ * that produced this token.
+ */
+public class Token {
+ // dictionary that wordId refers to; metadata getters delegate to it
+ private final Dictionary dictionary;
+
+ private final int wordId;
+
+ // surface form is the slice [offset, offset+length) of this shared char[]
+ private final char[] surfaceForm;
+ private final int offset;
+ private final int length;
+
+ // index of this token in the original input text
+ private final int position;
+
+ // KNOWN, UNKNOWN or USER: which kind of dictionary matched this token
+ private final Type type;
+
+ public Token(int wordId, char[] surfaceForm, int offset, int length, Type type, int position, Dictionary dictionary) {
+ this.wordId = wordId;
+ this.surfaceForm = surfaceForm;
+ this.offset = offset;
+ this.length = length;
+ this.type = type;
+ this.position = position;
+ this.dictionary = dictionary;
+ }
+
+ /**
+ * @return surfaceForm
+ */
+ public char[] getSurfaceForm() {
+ return surfaceForm;
+ }
+
+ /**
+ * @return offset into surfaceForm
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * @return length of surfaceForm
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * @return surfaceForm as a String
+ */
+ public String getSurfaceFormString() {
+ return new String(surfaceForm, offset, length);
+ }
+
+ /**
+ * @return reading. null if token doesn't have reading.
+ */
+ public String getReading() {
+ return dictionary.getReading(wordId);
+ }
+
+ /**
+ * @return pronunciation. null if token doesn't have pronunciation.
+ */
+ public String getPronunciation() {
+ return dictionary.getPronunciation(wordId);
+ }
+
+ /**
+ * @return part of speech.
+ */
+ public String getPartOfSpeech() {
+ return dictionary.getPartOfSpeech(wordId);
+ }
+
+ /**
+ * @return inflection type or null
+ */
+ public String getInflectionType() {
+ return dictionary.getInflectionType(wordId);
+ }
+
+ /**
+ * @return inflection form or null
+ */
+ public String getInflectionForm() {
+ return dictionary.getInflectionForm(wordId);
+ }
+
+ /**
+ * @return base form or null if token is not inflected
+ */
+ public String getBaseForm() {
+ return dictionary.getBaseForm(wordId);
+ }
+
+ /**
+ * Returns true if this token is known word
+ * @return true if this token is in standard dictionary. false if not.
+ */
+ public boolean isKnown() {
+ return type == Type.KNOWN;
+ }
+
+ /**
+ * Returns true if this token is unknown word
+ * @return true if this token is unknown word. false if not.
+ */
+ public boolean isUnknown() {
+ return type == Type.UNKNOWN;
+ }
+
+ /**
+ * Returns true if this token is defined in user dictionary
+ * @return true if this token is in user dictionary. false if not.
+ */
+ public boolean isUser() {
+ return type == Type.USER;
+ }
+
+ /**
+ * Get index of this token in input text
+ * @return position of token
+ */
+ public int getPosition() {
+ return position;
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,291 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Base class for dictionaries backed by pre-built binary resources loaded
+ * from the classpath: a raw entry buffer, a sourceId-to-wordIds map, a
+ * part-of-speech string table and an inflection type/form table.
+ */
+public abstract class BinaryDictionary implements Dictionary {
+
+ // Resource name suffixes for the four data files backing a dictionary.
+ public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
+ public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
+ public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
+ public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
+
+ // Codec headers and version, validated when each data file is loaded.
+ public static final String DICT_HEADER = "kuromoji_dict";
+ public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
+ public static final String POSDICT_HEADER = "kuromoji_dict_pos";
+ public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
+ public static final int VERSION = 1;
+
+ // Raw dictionary entries; a wordId is a byte offset into this buffer.
+ private final ByteBuffer buffer;
+ // targetMap holds delta-decoded word ids; targetMapOffsets[sourceId] is the
+ // start of sourceId's slice, with one extra sentinel entry at the end.
+ private final int[] targetMapOffsets, targetMap;
+ // Interned part-of-speech strings, indexed by each entry's pos index.
+ private final String[] posDict;
+ // Parallel tables: inflTypeDict[i] and inflFormDict[i] belong together.
+ private final String[] inflTypeDict;
+ private final String[] inflFormDict;
+
+ /**
+ * Loads the four data files via {@link #getResource(String)}. Any
+ * IOException is captured and rethrown after all streams are closed.
+ */
+ protected BinaryDictionary() throws IOException {
+ InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
+ IOException priorE = null;
+ int[] targetMapOffsets = null, targetMap = null;
+ String[] posDict = null;
+ String[] inflFormDict = null;
+ String[] inflTypeDict = null;
+ ByteBuffer buffer = null;
+ try {
+ // 1) target map: VInt-delta encoded word ids; the low bit of each value
+ // marks the start of a new sourceId's run, the high bits are the delta.
+ mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
+ mapIS = new BufferedInputStream(mapIS);
+ DataInput in = new InputStreamDataInput(mapIS);
+ CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
+ targetMap = new int[in.readVInt()];
+ targetMapOffsets = new int[in.readVInt()];
+ int accum = 0, sourceId = 0;
+ for (int ofs = 0; ofs < targetMap.length; ofs++) {
+ final int val = in.readVInt();
+ if ((val & 0x01) != 0) {
+ targetMapOffsets[sourceId] = ofs;
+ sourceId++;
+ }
+ accum += val >>> 1;
+ targetMap[ofs] = accum;
+ }
+ // the last slot of targetMapOffsets is reserved for the end sentinel
+ if (sourceId + 1 != targetMapOffsets.length)
+ throw new IOException("targetMap file format broken");
+ targetMapOffsets[sourceId] = targetMap.length;
+ mapIS.close(); mapIS = null;
+
+ // 2) part-of-speech string table
+ posIS = getResource(POSDICT_FILENAME_SUFFIX);
+ posIS = new BufferedInputStream(posIS);
+ in = new InputStreamDataInput(posIS);
+ CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
+ posDict = new String[in.readVInt()];
+ for (int j = 0; j < posDict.length; j++) {
+ posDict[j] = in.readString();
+ }
+ posIS.close(); posIS = null;
+
+ // 3) inflection table: pairs of (type, form) strings sharing an index
+ inflIS = getResource(INFLDICT_FILENAME_SUFFIX);
+ inflIS = new BufferedInputStream(inflIS);
+ in = new InputStreamDataInput(inflIS);
+ CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
+ int length = in.readVInt();
+ inflTypeDict = new String[length];
+ inflFormDict = new String[length];
+ for (int j = 0; j < length; j++) {
+ inflTypeDict[j] = in.readString();
+ inflFormDict[j] = in.readString();
+ }
+ inflIS.close(); inflIS = null;
+
+ // 4) main entry buffer, read in bulk into a direct (off-heap) buffer
+ dictIS = getResource(DICT_FILENAME_SUFFIX);
+ // no buffering here, as we load in one large buffer
+ in = new InputStreamDataInput(dictIS);
+ CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
+ final int size = in.readVInt();
+ final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
+ final ReadableByteChannel channel = Channels.newChannel(dictIS);
+ // NOTE(review): ReadableByteChannel.read is not guaranteed to fill the
+ // buffer in a single call; a partial read would spuriously trigger the
+ // EOFException below -- TODO consider looping until size bytes are read.
+ final int read = channel.read(tmpBuffer);
+ if (read != size) {
+ throw new EOFException("Cannot read whole dictionary");
+ }
+ dictIS.close(); dictIS = null;
+ buffer = tmpBuffer.asReadOnlyBuffer();
+ } catch (IOException ioe) {
+ priorE = ioe;
+ } finally {
+ // closes any still-open streams, then rethrows priorE if it was set
+ IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
+ }
+
+ this.targetMap = targetMap;
+ this.targetMapOffsets = targetMapOffsets;
+ this.posDict = posDict;
+ this.inflTypeDict = inflTypeDict;
+ this.inflFormDict = inflFormDict;
+ this.buffer = buffer;
+ }
+
+ /** Opens the data file for this concrete dictionary class. */
+ protected final InputStream getResource(String suffix) throws IOException {
+ return getClassResource(getClass(), suffix);
+ }
+
+ // util, reused by ConnectionCosts and CharacterDefinition
+ public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
+ final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
+ if (is == null)
+ throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
+ return is;
+ }
+
+ /** Sets {@code ref} to the slice of word ids mapped from {@code sourceId}. */
+ public void lookupWordIds(int sourceId, IntsRef ref) {
+ ref.ints = targetMap;
+ ref.offset = targetMapOffsets[sourceId];
+ // targetMapOffsets always has one more entry pointing behind last:
+ ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
+ }
+
+ // Entry layout at byte offset wordId: short leftId (+0), short rightId (+2),
+ // short wordCost (+4), byte posData (+6), then variable-length baseForm,
+ // reading, optional pronunciation and optional inflection index.
+ @Override
+ public int getLeftId(int wordId) {
+ return buffer.getShort(wordId);
+ }
+
+ @Override
+ public int getRightId(int wordId) {
+ return buffer.getShort(wordId + 2); // Skip left id
+ }
+
+ @Override
+ public int getWordCost(int wordId) {
+ return buffer.getShort(wordId + 4); // Skip left id and right id
+ }
+
+ @Override
+ public String getBaseForm(int wordId) {
+ // length byte's high 7 bits are the char count; 0 means "same as surface"
+ int offset = baseFormOffset(wordId);
+ int length = (buffer.get(offset++) & 0xff) >>> 1;
+ if (length == 0) {
+ return null; // same as surface form
+ } else {
+ return readString(offset, length, false);
+ }
+ }
+
+ @Override
+ public String getReading(int wordId) {
+ // length byte: high 7 bits = length, low bit = kana-compressed flag
+ int offset = readingOffset(wordId);
+ int readingData = buffer.get(offset++) & 0xff;
+ return readString(offset, readingData >>> 1, (readingData & 1) == 1);
+ }
+
+ @Override
+ public String getPartOfSpeech(int wordId) {
+ int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
+ return posDict[posIndex >>> 1];
+ }
+
+ @Override
+ public String getPronunciation(int wordId) {
+ if (hasPronunciationData(wordId)) {
+ int offset = pronunciationOffset(wordId);
+ int pronunciationData = buffer.get(offset++) & 0xff;
+ return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
+ } else {
+ return getReading(wordId); // same as the reading
+ }
+ }
+
+ @Override
+ public String getInflectionType(int wordId) {
+ int index = getInflectionIndex(wordId);
+ return index < 0 ? null : inflTypeDict[index];
+ }
+
+ @Override
+ public String getInflectionForm(int wordId) {
+ int index = getInflectionIndex(wordId);
+ return index < 0 ? null : inflFormDict[index];
+ }
+
+ private static int posOffset(int wordId) {
+ return wordId + 6;
+ }
+
+ private static int baseFormOffset(int wordId) {
+ return wordId + 7;
+ }
+
+ // Reading data starts right after the base form's length byte and chars.
+ private int readingOffset(int wordId) {
+ int offset = baseFormOffset(wordId);
+ int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
+ return offset + baseFormLength;
+ }
+
+ // Pronunciation data starts after the reading; the reading's byte size
+ // depends on whether it is kana-compressed (1 byte/char) or UTF-16.
+ private int pronunciationOffset(int wordId) {
+ int offset = readingOffset(wordId);
+ int readingData = buffer.get(offset++) & 0xff;
+ final int readingLength;
+ if ((readingData & 1) == 0) {
+ readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+ } else {
+ readingLength = readingData >>> 1;
+ }
+ return offset + readingLength;
+ }
+
+ // Low bit of the base form length byte: 0 = separate pronunciation stored.
+ private boolean hasPronunciationData(int wordId) {
+ int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
+ return (baseFormData & 1) == 0;
+ }
+
+ // Low bit of the pos byte: 1 = inflection index follows the entry data.
+ private boolean hasInflectionData(int wordId) {
+ int posData = buffer.get(posOffset(wordId)) & 0xff;
+ return (posData & 1) == 1;
+ }
+
+ /** Decodes the VInt-like inflection table index, or -1 if none stored. */
+ private int getInflectionIndex(int wordId) {
+ if (!hasInflectionData(wordId)) {
+ return -1; // common case: no inflection data
+ }
+
+ // skip past reading/pronunciation at the end
+ int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
+ int endData = buffer.get(offset++) & 0xff;
+
+ final int endLength;
+ if ((endData & 1) == 0) {
+ endLength = endData & 0xfe; // UTF-16: mask off kana bit
+ } else {
+ endLength = endData >>> 1;
+ }
+
+ offset += endLength;
+
+ // 1-2 byte little-endian varint: high bit of each byte is the continuation flag
+ byte b = buffer.get(offset++);
+ int i = b & 0x7F;
+ if ((b & 0x80) == 0) return i;
+ b = buffer.get(offset++);
+ i |= (b & 0x7F) << 7;
+ assert ((b & 0x80) == 0);
+ return i;
+ }
+
+ /**
+ * Decodes a stored string: kana-compressed strings store one byte per char
+ * as an offset from U+30A0 (katakana block); otherwise chars are UTF-16.
+ */
+ private String readString(int offset, int length, boolean kana) {
+ char text[] = new char[length];
+ if (kana) {
+ for (int i = 0; i < length; i++) {
+ text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
+ }
+ } else {
+ for (int i = 0; i < length; i++) {
+ text[i] = buffer.getChar(offset + (i << 1));
+ }
+ }
+ return new String(text);
+ }
+}