You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/12 16:40:31 UTC
svn commit: r1230598 - in
/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis:
KuromojiBaseFormFilterFactory.java
KuromojiPartOfSpeechStopFilterFactory.java KuromojiTokenizerFactory.java
Author: rmuir
Date: Thu Jan 12 15:40:30 2012
New Revision: 1230598
URL: http://svn.apache.org/viewvc?rev=1230598&view=rev
Log:
LUCENE-3305: add factories (TODO: tests) and allow user dictionary to be any encoding
Added:
lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java (with props)
lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java (with props)
Modified:
lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
Added: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java?rev=1230598&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java (added)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiBaseFormFilterFactory.java Thu Jan 12 15:40:30 2012
@@ -0,0 +1,40 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiBaseFormFilter;
+
+/**
+ * Factory for {@link KuromojiBaseFormFilter}.
+ * <pre class="prettyprint">
+ * <fieldType name="text_ja" class="solr.TextField">
+ * <analyzer>
+ * <tokenizer class="solr.KuromojiTokenizerFactory"/>
+ * <filter class="solr.KuromojiBaseFormFilterFactory"/>
+ * </analyzer>
+ * </fieldType>
+ * </pre>
+ */
+public class KuromojiBaseFormFilterFactory extends BaseTokenFilterFactory {
+
+  /**
+   * Wraps the supplied stream in a new {@link KuromojiBaseFormFilter}.
+   *
+   * @param input the token stream to wrap
+   * @return a {@link KuromojiBaseFormFilter} constructed over {@code input}
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    final TokenStream baseFormStream = new KuromojiBaseFormFilter(input);
+    return baseFormStream;
+  }
+}
Added: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java?rev=1230598&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java (added)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiPartOfSpeechStopFilterFactory.java Thu Jan 12 15:40:30 2012
@@ -0,0 +1,65 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Factory for {@link KuromojiPartOfSpeechStopFilter}.
+ * <pre class="prettyprint">
+ * <fieldType name="text_ja" class="solr.TextField">
+ * <analyzer>
+ * <tokenizer class="solr.KuromojiTokenizerFactory"/>
+ * <filter class="solr.KuromojiPartOfSpeechStopFilterFactory"
+ * tags="stopTags.txt"
+ * enablePositionIncrements="true"/>
+ * </analyzer>
+ * </fieldType>
+ * </pre>
+ */
+public class KuromojiPartOfSpeechStopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  /** Value of the "enablePositionIncrements" argument (false when absent); handed to every created filter. */
+  private boolean enablePositionIncrements;
+  /** Stop tags collected in {@link #inform(ResourceLoader)}; empty when no tag list was loaded. */
+  private Set<String> stopTags = new HashSet<String>();
+
+  /**
+   * Reads the factory arguments and loads the stop tags named by the "tags" argument.
+   *
+   * @param loader resource loader used to resolve the resource(s) named by "tags"
+   * @throws RuntimeException wrapping any {@link IOException} raised while loading the tags
+   */
+  @Override
+  public void inform(ResourceLoader loader) {
+    String stopTagFiles = args.get("tags");
+    enablePositionIncrements = getBoolean("enablePositionIncrements", false);
+    Set<String> loaded = new HashSet<String>();
+    try {
+      CharArraySet cas = getWordSet(loader, stopTagFiles, false);
+      // NOTE(review): getWordSet presumably returns null when "tags" is absent or names no files -- confirm.
+      // The guard keeps a missing "tags" argument from failing with a NullPointerException.
+      if (cas != null) {
+        for (Object element : cas) {
+          // elements are iterated as Object and cast to char[], then copied into a String
+          char[] chars = (char[]) element;
+          loaded.add(new String(chars));
+        }
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+    // publish only after loading succeeded, so a failed load never leaves a half-filled set behind
+    stopTags = loaded;
+  }
+
+  /**
+   * Wraps the supplied stream in a new {@link KuromojiPartOfSpeechStopFilter} configured with
+   * the position-increment setting and stop tags read by {@link #inform(ResourceLoader)}.
+   *
+   * @param stream the token stream to wrap
+   * @return a {@link KuromojiPartOfSpeechStopFilter} constructed over {@code stream}
+   */
+  @Override
+  public TokenStream create(TokenStream stream) {
+    return new KuromojiPartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
+  }
+}
Modified: lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1230598&r1=1230597&r2=1230598&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene3305/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Thu Jan 12 15:40:30 2012
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Locale;
@@ -35,10 +36,27 @@ import org.apache.solr.common.ResourceLo
import org.apache.solr.common.SolrException;
import org.apache.solr.util.plugin.ResourceLoaderAware;
+/**
+ * Factory for {@link KuromojiTokenizer}.
+ * <pre class="prettyprint">
+ * <fieldType name="text_ja" class="solr.TextField">
+ * <analyzer>
+ * <tokenizer class="solr.KuromojiTokenizerFactory"
+ * mode="NORMAL"
+ * user-dictionary="user.txt"
+ * user-dictionary-encoding="UTF-8"
+ * />
+ * <filter class="solr.KuromojiBaseFormFilterFactory"/>
+ * </analyzer>
+ * </fieldType>
+ * </pre>
+ */
public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
private static final String MODE = "mode";
private static final String USER_DICT_PATH = "user-dictionary";
+
+ private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
private Segmenter segmenter;
@@ -49,8 +67,12 @@ public class KuromojiTokenizerFactory ex
try {
if (userDictionaryPath != null) {
InputStream stream = loader.openResource(userDictionaryPath);
+ String encoding = args.get(USER_DICT_ENCODING);
+ if (encoding == null) {
+ encoding = IOUtils.UTF_8;
+ }
// note: the encoding comes from the "user-dictionary-encoding" argument (UTF-8 when absent); malformed or unmappable input is reported rather than replaced
- CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+ CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
Reader reader = new InputStreamReader(stream, decoder);