You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2015/11/10 19:45:42 UTC

svn commit: r1713712 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/sr/ analysis/common/src/test/org/apache/lucene/analysis/sr/

Author: dweiss
Date: Tue Nov 10 18:45:42 2015
New Revision: 1713712

URL: http://svn.apache.org/viewvc?rev=1713712&view=rev
Log:
LUCENE-6875: New Serbian Filter. (Nikola Smolenski via Robert Muir, Dawid Weiss)

Added:
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1713712&r1=1713711&r2=1713712&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Nov 10 18:45:42 2015
@@ -96,6 +96,9 @@ Changes in Runtime Behavior
 
 New Features
 
+* LUCENE-6875: New Serbian normalization filter. (Nikola Smolenski via 
+  Robert Muir, Dawid Weiss)
+
 * LUCENE-6720: New FunctionRangeQuery wrapper around ValueSourceScorer
   (returned from ValueSource/FunctionValues.getRangeScorer()). (David Smiley)
 

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java?rev=1713712&r1=1713711&r2=1713712&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java Tue Nov 10 18:45:42 2015
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.sr;
  * limitations under the License.
  */
 
+import java.util.Arrays;
 import java.util.Map;
 
 import org.apache.lucene.analysis.TokenStream;
@@ -31,15 +32,19 @@ import org.apache.lucene.analysis.util.T
  *   <analyzer>
  *     <tokenizer class="solr.StandardTokenizerFactory"/>
  *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.SerbianNormalizationFilterFactory"/>
+ *     <filter class="solr.SerbianNormalizationFilterFactory"
+ *       haircut="bald"/> 
  *   </analyzer>
  * &lt;/fieldType&gt;</pre> 
  */
 public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+  final String haircut;
 
   /** Creates a new SerbianNormalizationFilterFactory */
   public SerbianNormalizationFilterFactory(Map<String,String> args) {
     super(args);
+
+	this.haircut = get(args, "haircut", Arrays.asList( "bald", "regular" ), "bald");
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -47,7 +52,11 @@ public class SerbianNormalizationFilterF
 
   @Override
   public TokenStream create(TokenStream input) {
-    return new SerbianNormalizationFilter(input);
+	if( this.haircut.equals( "regular" ) ) {
+	    return new SerbianNormalizationRegularFilter(input);
+	} else {
+	    return new SerbianNormalizationFilter(input);
+	}
   }
 
   @Override

Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java?rev=1713712&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java Tue Nov 10 18:45:42 2015
@@ -0,0 +1,165 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Normalizes Serbian Cyrillic to Latin.
+ *
+ * Note that it expects lowercased input.
+ */
+public final class SerbianNormalizationRegularFilter extends TokenFilter {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  
+  public SerbianNormalizationRegularFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char buffer[] = termAtt.buffer();
+      int length = termAtt.length();
+      for (int i = 0; i < length; i++) {
+        final char c = buffer[i];
+        switch(c) {
+        case 'а':
+          buffer[i] = 'a';
+          break;
+        case 'б':
+          buffer[i] = 'b';
+          break;
+        case 'в':
+          buffer[i] = 'v';
+          break;
+        case 'г':
+          buffer[i] = 'g';
+          break;
+        case 'д':
+          buffer[i] = 'd';
+          break;
+        case 'ђ':
+          buffer[i] = 'đ';
+          break;
+        case 'е':
+          buffer[i] = 'e';
+          break;
+        case 'ж':
+          buffer[i] = 'ž';
+          break;
+        case 'з':
+          buffer[i] = 'z';
+          break;
+        case 'и':
+          buffer[i] = 'i';
+          break;
+        case 'ј':
+          buffer[i] = 'j';
+          break;
+        case 'к':
+          buffer[i] = 'k';
+          break;
+        case 'л':
+          buffer[i] = 'l';
+          break;
+        case 'љ':
+          buffer = termAtt.resizeBuffer(1+length);
+          if (i < length) {
+            System.arraycopy(buffer, i, buffer, i+1, (length-i));
+          }
+          buffer[i] = 'l';
+          buffer[++i] = 'j';
+          length++;
+          break;
+        case 'м':
+          buffer[i] = 'm';
+          break;
+        case 'н':
+          buffer[i] = 'n';
+          break;
+        case 'њ':
+          buffer = termAtt.resizeBuffer(1+length);
+          if (i < length) {
+            System.arraycopy(buffer, i, buffer, i+1, (length-i));
+          }
+          buffer[i] = 'n';
+          buffer[++i] = 'j';
+          length++;
+          break;
+        case 'о':
+          buffer[i] = 'o';
+          break;
+        case 'п':
+          buffer[i] = 'p';
+          break;
+        case 'р':
+          buffer[i] = 'r';
+          break;
+        case 'с':
+          buffer[i] = 's';
+          break;
+        case 'т':
+          buffer[i] = 't';
+          break;
+        case 'ћ':
+          buffer[i] = 'ć';
+          break;
+        case 'у':
+          buffer[i] = 'u';
+          break;
+        case 'ф':
+          buffer[i] = 'f';
+          break;
+        case 'х':
+          buffer[i] = 'h';
+          break;
+        case 'ц':
+          buffer[i] = 'c';
+          break;
+        case 'ч':
+          buffer[i] = 'č';
+          break;
+        case 'џ':
+          buffer = termAtt.resizeBuffer(1+length);
+          if (i < length) {
+            System.arraycopy(buffer, i, buffer, i+1, (length-i));
+          }
+          buffer[i] = 'd';
+          buffer[++i] = 'ž';
+          length++;
+          break;
+        case 'ш':
+          buffer[i] = 'š';
+          break;
+        default:
+          break;
+        }
+      }
+      termAtt.setLength(length);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java?rev=1713712&r1=1713711&r2=1713712&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java Tue Nov 10 18:45:42 2015
@@ -36,7 +36,15 @@ public class TestSerbianNormalizationFil
     stream = tokenFilterFactory("SerbianNormalization").create(stream);
     assertTokenStreamContents(stream, new String[] { "djura" });
   }
-  
+
+  public void testRegularStemming() throws Exception {
+    Reader reader = new StringReader("ђура");
+    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    ((Tokenizer)stream).setReader(reader);
+    stream = tokenFilterFactory("SerbianNormalization", "haircut", "regular").create(stream);
+    assertTokenStreamContents(stream, new String[] { "đura" });
+  }
+   
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     try {
@@ -46,4 +54,5 @@ public class TestSerbianNormalizationFil
       assertTrue(expected.getMessage().contains("Unknown parameters"));
     }
   }
+
 }

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java?rev=1713712&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java Tue Nov 10 18:45:42 2015
@@ -0,0 +1,84 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+/**
+ * Tests {@link SerbianNormalizationRegularFilter}
+ */
+public class TestSerbianNormalizationRegularFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer;
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        final TokenStream stream = new SerbianNormalizationRegularFilter(tokenizer);
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
+  }
+  
+  @Override
+  public void tearDown() throws Exception {
+    analyzer.close();
+    super.tearDown();
+  }
+  
+  /**
+   * Tests Cyrillic text.
+   */
+  public void testCyrillic() throws IOException {
+    checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgdđežzijklljmnnjoprstćufhcčdžš");
+  }
+
+  /**
+   * Tests Latin text.
+   */
+  public void testLatin() throws IOException {
+    checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcčćddžđefghijklljmnnjoprsštuvzž");
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+  }
+  
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new SerbianNormalizationRegularFilter(tokenizer));
+      }
+    };
+    checkOneTerm(a, "", "");
+    a.close();
+  }
+}