You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/11/10 19:45:42 UTC
svn commit: r1713712 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/sr/
analysis/common/src/test/org/apache/lucene/analysis/sr/
Author: dweiss
Date: Tue Nov 10 18:45:42 2015
New Revision: 1713712
URL: http://svn.apache.org/viewvc?rev=1713712&view=rev
Log:
LUCENE-6875: New Serbian Filter. (Nikola Smolenski via Robert Muir, Dawid Weiss)
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1713712&r1=1713711&r2=1713712&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Nov 10 18:45:42 2015
@@ -96,6 +96,9 @@ Changes in Runtime Behavior
New Features
+* LUCENE-6875: New Serbian normalization filter. (Nikola Smolenski via
+ Robert Muir, Dawid Weiss)
+
* LUCENE-6720: New FunctionRangeQuery wrapper around ValueSourceScorer
(returned from ValueSource/FunctionValues.getRangeScorer()). (David Smiley)
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java?rev=1713712&r1=1713711&r2=1713712&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java Tue Nov 10 18:45:42 2015
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.sr;
* limitations under the License.
*/
+import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
@@ -31,15 +32,19 @@ import org.apache.lucene.analysis.util.T
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.SerbianNormalizationFilterFactory"/>
+ * <filter class="solr.SerbianNormalizationFilterFactory"
+ * haircut="bald"/>
* </analyzer>
* </fieldType></pre>
*/
public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+ final String haircut;
/** Creates a new SerbianNormalizationFilterFactory */
public SerbianNormalizationFilterFactory(Map<String,String> args) {
super(args);
+
+ this.haircut = get(args, "haircut", Arrays.asList( "bald", "regular" ), "bald");
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -47,7 +52,11 @@ public class SerbianNormalizationFilterF
@Override
public TokenStream create(TokenStream input) {
- return new SerbianNormalizationFilter(input);
+ if( this.haircut.equals( "regular" ) ) {
+ return new SerbianNormalizationRegularFilter(input);
+ } else {
+ return new SerbianNormalizationFilter(input);
+ }
}
@Override
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java?rev=1713712&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java Tue Nov 10 18:45:42 2015
@@ -0,0 +1,165 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Normalizes Serbian Cyrillic to Latin.
+ *
+ * Note that it expects lowercased input.
+ */
+public final class SerbianNormalizationRegularFilter extends TokenFilter {
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ public SerbianNormalizationRegularFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char buffer[] = termAtt.buffer();
+ int length = termAtt.length();
+ for (int i = 0; i < length; i++) {
+ final char c = buffer[i];
+ switch(c) {
+ case 'а':
+ buffer[i] = 'a';
+ break;
+ case 'б':
+ buffer[i] = 'b';
+ break;
+ case 'в':
+ buffer[i] = 'v';
+ break;
+ case 'г':
+ buffer[i] = 'g';
+ break;
+ case 'д':
+ buffer[i] = 'd';
+ break;
+ case 'ђ':
+ buffer[i] = 'đ';
+ break;
+ case 'е':
+ buffer[i] = 'e';
+ break;
+ case 'ж':
+ buffer[i] = 'ž';
+ break;
+ case 'з':
+ buffer[i] = 'z';
+ break;
+ case 'и':
+ buffer[i] = 'i';
+ break;
+ case 'ј':
+ buffer[i] = 'j';
+ break;
+ case 'к':
+ buffer[i] = 'k';
+ break;
+ case 'л':
+ buffer[i] = 'l';
+ break;
+ case 'љ':
+ buffer = termAtt.resizeBuffer(1+length);
+ if (i < length) {
+ System.arraycopy(buffer, i, buffer, i+1, (length-i));
+ }
+ buffer[i] = 'l';
+ buffer[++i] = 'j';
+ length++;
+ break;
+ case 'м':
+ buffer[i] = 'm';
+ break;
+ case 'н':
+ buffer[i] = 'n';
+ break;
+ case 'њ':
+ buffer = termAtt.resizeBuffer(1+length);
+ if (i < length) {
+ System.arraycopy(buffer, i, buffer, i+1, (length-i));
+ }
+ buffer[i] = 'n';
+ buffer[++i] = 'j';
+ length++;
+ break;
+ case 'о':
+ buffer[i] = 'o';
+ break;
+ case 'п':
+ buffer[i] = 'p';
+ break;
+ case 'р':
+ buffer[i] = 'r';
+ break;
+ case 'с':
+ buffer[i] = 's';
+ break;
+ case 'т':
+ buffer[i] = 't';
+ break;
+ case 'ћ':
+ buffer[i] = 'ć';
+ break;
+ case 'у':
+ buffer[i] = 'u';
+ break;
+ case 'ф':
+ buffer[i] = 'f';
+ break;
+ case 'х':
+ buffer[i] = 'h';
+ break;
+ case 'ц':
+ buffer[i] = 'c';
+ break;
+ case 'ч':
+ buffer[i] = 'č';
+ break;
+ case 'џ':
+ buffer = termAtt.resizeBuffer(1+length);
+ if (i < length) {
+ System.arraycopy(buffer, i, buffer, i+1, (length-i));
+ }
+ buffer[i] = 'd';
+ buffer[++i] = 'ž';
+ length++;
+ break;
+ case 'ш':
+ buffer[i] = 'š';
+ break;
+ default:
+ break;
+ }
+ }
+ termAtt.setLength(length);
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java?rev=1713712&r1=1713711&r2=1713712&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java Tue Nov 10 18:45:42 2015
@@ -36,7 +36,15 @@ public class TestSerbianNormalizationFil
stream = tokenFilterFactory("SerbianNormalization").create(stream);
assertTokenStreamContents(stream, new String[] { "djura" });
}
-
+
+ public void testRegularStemming() throws Exception {
+ Reader reader = new StringReader("ђура");
+ TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ ((Tokenizer)stream).setReader(reader);
+ stream = tokenFilterFactory("SerbianNormalization", "haircut", "regular").create(stream);
+ assertTokenStreamContents(stream, new String[] { "đura" });
+ }
+
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
@@ -46,4 +54,5 @@ public class TestSerbianNormalizationFil
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
+
}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java?rev=1713712&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java Tue Nov 10 18:45:42 2015
@@ -0,0 +1,84 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+/**
+ * Tests {@link SerbianNormalizationFilter}
+ */
+public class TestSerbianNormalizationRegularFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream stream = new SerbianNormalizationRegularFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ analyzer.close();
+ super.tearDown();
+ }
+
+ /**
+ * Tests Cyrillic text.
+ */
+ public void testCyrillic() throws IOException {
+ checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgdđežzijklljmnnjoprstćufhcčdžš");
+ }
+
+ /**
+ * Tests Latin text.
+ */
+ public void testLatin() throws IOException {
+ checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcčćddžđefghijklljmnnjoprsštuvzž");
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+ }
+
+ public void testEmptyTerm() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(tokenizer, new SerbianNormalizationRegularFilter(tokenizer));
+ }
+ };
+ checkOneTerm(a, "", "");
+ a.close();
+ }
+}