You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2014/11/11 18:28:36 UTC
svn commit: r1638220 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/sr/
analysis/common/src/resources/META-INF/services/
analysis/common/src/test/org/apache/lucene/analysis/sr/
Author: mikemccand
Date: Tue Nov 11 17:28:35 2014
New Revision: 1638220
URL: http://svn.apache.org/r1638220
Log:
LUCENE-6053: add Serbian analyzer
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/package.html (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1638220&r1=1638219&r2=1638220&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Nov 11 17:28:35 2014
@@ -77,6 +77,8 @@ New Features
improved exception handling, and indirect norms encoding for sparse fields.
(Mike McCandless, Ryan Ernst, Robert Muir)
+* LUCENE-6053: Add Serbian analyzer. (Nikola Smolenski via Robert Muir, Mike McCandless)
+
API Changes
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.java?rev=1638220&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.java Tue Nov 11 17:28:35 2014
@@ -0,0 +1,174 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Normalizes Serbian Cyrillic and Latin characters to "bald" Latin.
+ * <p>
+ * Every Cyrillic letter is replaced by its Latin transliteration and Latin
+ * letters lose their diacritics, with the exception of đ (and its Cyrillic
+ * counterpart ђ), which is converted to dj. The Cyrillic letters љ, њ and џ
+ * also expand to two letters each: lj, nj and dz.
+ * <p>
+ * Note that it expects lowercased input.
+ * <p>
+ * All non-ASCII characters in the mapping table are written as Unicode
+ * escapes, with the glyph in a trailing comment, so that the class still
+ * compiles if the file is ever read or written with the wrong charset.
+ */
+public final class SerbianNormalizationFilter extends TokenFilter {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Creates a filter that normalizes the terms produced by {@code input}.
+   *
+   * @param input the token stream whose terms are rewritten in place
+   */
+  public SerbianNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and rewrites the current term.
+   *
+   * @return {@code true} if a token was produced, {@code false} once the
+   *         wrapped stream is exhausted
+   * @throws IOException if the wrapped stream fails
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    char[] buffer = termAtt.buffer();
+    int length = termAtt.length();
+    for (int i = 0; i < length; i++) {
+      switch (buffer[i]) {
+        case '\u0430': // а
+          buffer[i] = 'a';
+          break;
+        case '\u0431': // б
+          buffer[i] = 'b';
+          break;
+        case '\u0432': // в
+          buffer[i] = 'v';
+          break;
+        case '\u0433': // г
+          buffer[i] = 'g';
+          break;
+        case '\u0434': // д
+          buffer[i] = 'd';
+          break;
+        case '\u0452': // ђ
+        case '\u0111': // đ
+          buffer = replaceWithPair(i, length, 'd', 'j');
+          i++; // step over the second letter that was just written
+          length++;
+          break;
+        case '\u0435': // е
+          buffer[i] = 'e';
+          break;
+        case '\u0436': // ж
+        case '\u0437': // з
+        case '\u017E': // ž
+          buffer[i] = 'z';
+          break;
+        case '\u0438': // и
+          buffer[i] = 'i';
+          break;
+        case '\u0458': // ј
+          buffer[i] = 'j';
+          break;
+        case '\u043A': // к
+          buffer[i] = 'k';
+          break;
+        case '\u043B': // л
+          buffer[i] = 'l';
+          break;
+        case '\u0459': // љ
+          buffer = replaceWithPair(i, length, 'l', 'j');
+          i++;
+          length++;
+          break;
+        case '\u043C': // м
+          buffer[i] = 'm';
+          break;
+        case '\u043D': // н
+          buffer[i] = 'n';
+          break;
+        case '\u045A': // њ
+          buffer = replaceWithPair(i, length, 'n', 'j');
+          i++;
+          length++;
+          break;
+        case '\u043E': // о
+          buffer[i] = 'o';
+          break;
+        case '\u043F': // п
+          buffer[i] = 'p';
+          break;
+        case '\u0440': // р
+          buffer[i] = 'r';
+          break;
+        case '\u0441': // с
+          buffer[i] = 's';
+          break;
+        case '\u0442': // т
+          buffer[i] = 't';
+          break;
+        case '\u045B': // ћ
+        case '\u0446': // ц
+        case '\u0447': // ч
+        case '\u0107': // ć
+        case '\u010D': // č
+          buffer[i] = 'c';
+          break;
+        case '\u0443': // у
+          buffer[i] = 'u';
+          break;
+        case '\u0444': // ф
+          buffer[i] = 'f';
+          break;
+        case '\u0445': // х
+          buffer[i] = 'h';
+          break;
+        case '\u045F': // џ
+          buffer = replaceWithPair(i, length, 'd', 'z');
+          i++;
+          length++;
+          break;
+        case '\u0448': // ш
+        case '\u0161': // š
+          buffer[i] = 's';
+          break;
+        default:
+          break;
+      }
+    }
+    // The term may have grown by one character per expanded letter.
+    termAtt.setLength(length);
+    return true;
+  }
+
+  /**
+   * Replaces the single character at {@code pos} with the two characters
+   * {@code first} and {@code second}, moving the rest of the term one slot to
+   * the right. The term length attribute is not touched here; the caller
+   * tracks the new length and sets it once the whole term has been processed.
+   *
+   * @param pos index of the character being replaced; must be less than
+   *        {@code length}
+   * @param length number of valid characters currently in the term buffer
+   * @param first character written at {@code pos}
+   * @param second character written at {@code pos + 1}
+   * @return the array returned by {@code resizeBuffer}; the caller must keep
+   *         working on this array rather than on the one it held before
+   */
+  private char[] replaceWithPair(int pos, int length, char first, char second) {
+    final char[] grown = termAtt.resizeBuffer(length + 1);
+    // pos < length always holds for callers inside the scan loop, so at least
+    // one character (the one being replaced) is shifted.
+    System.arraycopy(grown, pos, grown, pos + 1, length - pos);
+    grown[pos] = first;
+    grown[pos + 1] = second;
+    return grown;
+  }
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java?rev=1638220&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java Tue Nov 11 17:28:35 2014
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link SerbianNormalizationFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_srnorm" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SerbianNormalizationFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+  /**
+   * Creates a new SerbianNormalizationFilterFactory.
+   *
+   * @param args factory arguments; this factory defines none of its own
+   * @throws IllegalArgumentException if {@code args} is not empty after the
+   *         superclass constructor has run
+   */
+  public SerbianNormalizationFilterFactory(Map<String,String> args) {
+    super(args);
+    // NOTE(review): presumably the superclass removes the entries it
+    // recognizes, so anything still present is unknown -- confirm.
+    final boolean hasLeftovers = !args.isEmpty();
+    if (hasLeftovers) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Returns this factory itself as the component used for multi-term queries. */
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+
+  /** Wraps {@code input} in a new {@link SerbianNormalizationFilter}. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SerbianNormalizationFilter(input);
+  }
+
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/package.html?rev=1638220&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/package.html (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/package.html Tue Nov 11 17:28:35 2014
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Serbian.
+</body>
+</html>
Modified: lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1638220&r1=1638219&r2=1638220&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Tue Nov 11 17:28:35 2014
@@ -91,6 +91,7 @@ org.apache.lucene.analysis.reverse.Rever
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
+org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory
org.apache.lucene.analysis.standard.StandardFilterFactory
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilter.java?rev=1638220&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilter.java Tue Nov 11 17:28:35 2014
@@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+/**
+ * Tests {@link SerbianNormalizationFilter}
+ */
+public class TestSerbianNormalizationFilter extends BaseTokenStreamTestCase {
+
+  /** MockTokenizer in WHITESPACE mode followed by the filter under test. */
+  private final Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String field) {
+      final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+      final TokenStream stream = new SerbianNormalizationFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stream);
+    }
+  };
+
+  /**
+   * Tests Cyrillic text: the 30 lowercase letters of the Serbian Cyrillic
+   * alphabet, in alphabetical order, as a single term.
+   */
+  public void testCyrillic() throws IOException {
+    checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgddjezzijklljmnnjoprstcufhccdzs");
+  }
+
+  /**
+   * Tests Latin text: the lowercase Serbian Latin alphabet, including the
+   * letters with diacritics, as a single term.
+   */
+  public void testLatin() throws IOException {
+    checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcccddzdjefghijklljmnnjoprsstuvzz");
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+  }
+
+  /** Checks that an empty term passes through the filter unchanged. */
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new SerbianNormalizationFilter(tokenizer));
+      }
+    };
+    checkOneTerm(a, "", "");
+  }
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java?rev=1638220&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java Tue Nov 11 17:28:35 2014
@@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the Serbian normalization factory is working.
+ */
+public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  /**
+   * Checks that a filter obtained through the factory turns "đura" into
+   * "djura". The input is lowercase because the filter expects lowercased
+   * input.
+   */
+  public void testStemming() throws Exception {
+    Reader reader = new StringReader("đura");
+    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    ((Tokenizer)stream).setReader(reader);
+    stream = tokenFilterFactory("SerbianNormalization").create(stream);
+    assertTokenStreamContents(stream, new String[] { "djura" });
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("SerbianNormalization", "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameters"));
+    }
+  }
+}