You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/12 16:40:31 UTC

svn commit: r1230598 - in /lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis: KuromojiBaseFormFilterFactory.java KuromojiPartOfSpeechStopFilterFactory.java KuromojiTokenizerFactory.java

Author: rmuir
Date: Thu Jan 12 15:40:30 2012
New Revision: 1230598

URL: http://svn.apache.org/viewvc?rev=1230598&view=rev
Log:
LUCENE-3305: add factories (TODO: tests) and allow user dictionary to be any encoding

Added:
    lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java   (with props)
    lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java   (with props)
Modified:
    lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java

Added: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java?rev=1230598&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java (added)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java Thu Jan 12 15:40:30 2012
@@ -0,0 +1,40 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiBaseFormFilter;
+
+/**
+ * Factory for {@link KuromojiBaseFormFilter}.  
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.KuromojiBaseFormFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ */
+public class KuromojiBaseFormFilterFactory extends BaseTokenFilterFactory {
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new KuromojiBaseFormFilter(input);
+  }
+}

Added: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java?rev=1230598&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java (added)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java Thu Jan 12 15:40:30 2012
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Factory for {@link KuromojiPartOfSpeechStopFilter}.  
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.KuromojiPartOfSpeechStopFilterFactory" 
+ *             tags="stopTags.txt" 
+ *             enablePositionIncrements="true"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ */
+public class KuromojiPartOfSpeechStopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware  {
+  private boolean enablePositionIncrements;
+  private Set<String> stopTags;
+
+  public void inform(ResourceLoader loader) {
+    String stopTagFiles = args.get("tags");
+    enablePositionIncrements = getBoolean("enablePositionIncrements", false);
+    try {
+      CharArraySet cas = getWordSet(loader, stopTagFiles, false);
+      stopTags = new HashSet<String>();
+      for (Object element : cas) {
+        char chars[] = (char[]) element;
+        stopTags.add(new String(chars));
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  public TokenStream create(TokenStream stream) {
+    return new KuromojiPartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
+  }
+}

Modified: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1230598&r1=1230597&r2=1230598&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Thu Jan 12 15:40:30 2012
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.util.Locale;
@@ -35,10 +36,27 @@ import org.apache.solr.common.ResourceLo
 import org.apache.solr.common.SolrException;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 
+/**
+ * Factory for {@link KuromojiTokenizer}.  
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KuromojiTokenizerFactory"
+ *       mode=NORMAL
+ *       user-dictionary=user.txt
+ *       user-dictionary-encoding=UTF-8
+ *     /&gt;
+ *     &lt;filter class="solr.KuromojiBaseFormFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ */
 public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
   private static final String MODE = "mode";
   
   private static final String USER_DICT_PATH = "user-dictionary";
+  
+  private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
 
   private Segmenter segmenter;
   
@@ -49,8 +67,12 @@ public class KuromojiTokenizerFactory ex
     try {
       if (userDictionaryPath != null) {
         InputStream stream = loader.openResource(userDictionaryPath);
+        String encoding = args.get(USER_DICT_ENCODING);
+        if (encoding == null) {
+          encoding = IOUtils.UTF_8;
+        }
         // note: we could allow for other encodings here as an argument
-        CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+        CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
             .onMalformedInput(CodingErrorAction.REPORT)
             .onUnmappableCharacter(CodingErrorAction.REPORT);
         Reader reader = new InputStreamReader(stream, decoder);