You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:51:20 UTC
svn commit: r1226644 - in /lucene/dev/branches/lucene3305: modules/analysis/
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/
Author: rmuir
Date: Tue Jan 3 04:51:19 2012
New Revision: 1226644
URL: http://svn.apache.org/viewvc?rev=1226644&view=rev
Log:
LUCENE-3305: add lucene integration mostly as-is
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (with props)
lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (with props)
Modified:
lucene/dev/branches/lucene3305/modules/analysis/NOTICE.txt
Modified: lucene/dev/branches/lucene3305/modules/analysis/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/NOTICE.txt?rev=1226644&r1=1226643&r2=1226644&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/NOTICE.txt (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/NOTICE.txt Tue Jan 3 04:51:19 2012
@@ -71,3 +71,86 @@ LGPL and Creative Commons ShareAlike.
Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)
+
+===========================================================================
+Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
+===========================================================================
+
+This software includes a binary and/or source version of data from
+
+ mecab-ipadic-2.7.0-20070801
+
+which can be obtained from
+
+ http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
+
+or
+
+ http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
+
+===========================================================================
+mecab-ipadic-2.7.0-20070801 Notice
+===========================================================================
+
+Nara Institute of Science and Technology (NAIST),
+the copyright holders, disclaims all warranties with regard to this
+software, including all implied warranties of merchantability and
+fitness, in no event shall NAIST be liable for
+any special, indirect or consequential damages or any damages
+whatsoever resulting from loss of use, data or profits, whether in an
+action of contract, negligence or other tortuous action, arising out
+of or in connection with the use or performance of this software.
+
+A large portion of the dictionary entries
+originate from ICOT Free Software. The following conditions for ICOT
+Free Software applies to the current dictionary as well.
+
+Each User may also freely distribute the Program, whether in its
+original form or modified, to any third party or parties, PROVIDED
+that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+on, or be attached to, the Program, which is distributed substantially
+in the same form as set out herein and that such intended
+distribution, if actually made, will neither violate or otherwise
+contravene any of the laws and regulations of the countries having
+jurisdiction over the User or the intended distribution itself.
+
+NO WARRANTY
+
+The program was produced on an experimental basis in the course of the
+research and development conducted during the project and is provided
+to users as so produced on an experimental basis. Accordingly, the
+program is provided without any warranty whatsoever, whether express,
+implied, statutory or otherwise. The term "warranty" used herein
+includes, but is not limited to, any warranty of the quality,
+performance, merchantability and fitness for a particular purpose of
+the program and the nonexistence of any infringement or violation of
+any right of any third party.
+
+Each user of the program will agree and understand, and be deemed to
+have agreed and understood, that there is no warranty whatsoever for
+the program and, accordingly, the entire risk arising from or
+otherwise connected with the program is assumed by the user.
+
+Therefore, neither ICOT, the copyright holder, or any other
+organization that participated in or was otherwise related to the
+development of the program and their respective officials, directors,
+officers and other employees shall be held liable for any and all
+damages, including, without limitation, general, special, incidental
+and consequential damages, arising out of or otherwise in connection
+with the use or inability to use the program or any product, material
+or result produced or otherwise obtained by using the program,
+regardless of whether they have been advised of, or otherwise had
+knowledge of, the possibility of such damages at any time during the
+project or thereafter. Each user will be deemed to have agreed to the
+foregoing by his or her commencement of use of the program. The term
+"use" as used herein includes, but is not limited to, the use,
+modification, copying and distribution of the program and the
+production of secondary products from the program.
+
+In the case where the program, whether in its original form or
+modified, was distributed or delivered to or received by a user from
+any person, organization or entity other than ICOT, unless it makes or
+grants independently of ICOT any specific warranty to the user in
+writing, such person, organization or entity, will also be exempted
+from and not be held liable to the user for any such damages as noted
+above as far as the program is concerned.
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1226644&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Tue Jan 3 04:51:19 2012
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+
+public class KuromojiAnalyzer extends Analyzer {
+ org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer;
+
+ public KuromojiAnalyzer(org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer) {
+ this.tokenizer = tokenizer;
+ }
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer;
+ try {
+ tokenizer = new KuromojiTokenizer(this.tokenizer, reader);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ } catch (IOException e) {
+ // nocommit
+ throw new RuntimeException(e);
+ }
+ }
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1226644&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Tue Jan 3 04:51:19 2012
@@ -0,0 +1,87 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+public class KuromojiTokenizer extends Tokenizer {
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer;
+
+ private String str;
+
+ private List<Token> tokens;
+
+ private int tokenIndex = 0;
+
+ public KuromojiTokenizer(org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer, Reader input) throws IOException {
+ super(input);
+ this.tokenizer = tokenizer;
+ // nocommit: this won't really work for large docs.
+ // what kind of context does kuromoji need? just sentence maybe?
+ str = IOUtils.toString(input);
+ init();
+ }
+
+ private void init() {
+ tokenIndex = 0;
+ tokens = tokenizer.tokenize(str);
+ }
+
+ @Override
+ public boolean incrementToken() {
+ if(tokenIndex == tokens.size()) {
+ return false;
+ }
+
+ Token token = tokens.get(tokenIndex);
+ String surfaceForm = token.getSurfaceForm();
+ int position = token.getPosition();
+ int length = surfaceForm.length();
+
+ termAtt.setEmpty().append(str, position, length);
+ offsetAtt.setOffset(correctOffset(position), correctOffset(position + length));
+ typeAtt.setType(token.getPartOfSpeech());
+ tokenIndex++;
+ return true;
+ }
+
+ @Override
+ public void end() {
+ final int ofs = correctOffset(str.length());
+ offsetAtt.setOffset(ofs, ofs);
+ }
+
+ @Override
+ public void reset(Reader input) throws IOException{
+ super.reset(input);
+ str = IOUtils.toString(input);
+ init();
+ }
+
+}
Added: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1226644&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (added)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Tue Jan 3 04:51:19 2012
@@ -0,0 +1,58 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
+import org.apache.lucene.analysis.kuromoji.Tokenizer.Mode;
+import org.apache.solr.analysis.BaseTokenizerFactory;
+import org.apache.solr.common.SolrException;
+
+public class KuromojiTokenizerFactory extends BaseTokenizerFactory{
+ private static final String MODE = "mode";
+
+ private static final String USER_DICT_PATH = "user-dictionary";
+
+ private org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer;
+
+ @Override
+ public void init(Map<String,String> args) {
+ this.args = args;
+ Mode mode = args.get(MODE) != null ? Mode.valueOf(args.get(MODE).toUpperCase(Locale.ENGLISH)) : Mode.NORMAL;
+ String userDictionaryPath = args.get(USER_DICT_PATH);
+ try {
+ this.tokenizer = org.apache.lucene.analysis.kuromoji.Tokenizer.builder().mode(mode).userDictionary(userDictionaryPath).build();
+ } catch (Exception e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
+ }
+
+ @Override
+ public Tokenizer create(Reader input) {
+ try {
+ return new KuromojiTokenizer(tokenizer, input);
+ } catch (IOException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
+ }
+}
\ No newline at end of file