You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2012/08/23 22:20:44 UTC

svn commit: r1376676 - in /commons/proper/codec/trunk/src: changes/changes.xml test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java

Author: tn
Date: Thu Aug 23 20:20:44 2012
New Revision: 1376676

URL: http://svn.apache.org/viewvc?rev=1376676&view=rev
Log:
[CODEC-146] Added regression tests for PhoneticEngine. Thanks to Julius Davies.

Added:
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java   (with props)
Modified:
    commons/proper/codec/trunk/src/changes/changes.xml

Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1376676&r1=1376675&r2=1376676&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Thu Aug 23 20:20:44 2012
@@ -51,6 +51,9 @@ The <action> type attribute can be add,u
     </release>
     -->
     <release version="1.7" date="TBD" description="Feature and fix release.">
+      <action issue="CODEC-146" dev="tn" type="add" due-to="Julius Davies">
+        Added regression tests for PhoneticEngine based on Solr-3.6.0.
+      </action>
       <action issue="CODEC-147" dev="tn" type="update">
         BeiderMorseEncoder/PhoneticEngine: make results deterministic by using a LinkedHashSet
         instead of a HashSet.

Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java?rev=1376676&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java (added)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java Thu Aug 23 20:20:44 2012
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language.bm;
+
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.TreeMap;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
+ *
+ * @since 1.7
+ */
+public class PhoneticEngineRegressionTest {
+
+    @Test
+    public void testSolrGENERIC() {
+        Map<String, String> args;
+
+        // concat is true, ruleType is EXACT
+        args = new TreeMap<String, String>();
+        args.put("nameType", "GENERIC");
+        assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+        args.put("ruleType", "EXACT");
+        assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
+        assertEquals(encode(args, true, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anxelo");
+        assertEquals(encode(args, true, "1234"), "");
+
+        // concat is false, ruleType is EXACT
+        args = new TreeMap<String, String>();
+        assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+        args.put("ruleType", "EXACT");
+        assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
+        assertEquals(encode(args, false, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anxelo");
+        assertEquals(encode(args, false, "1234"), "");
+
+        // concat is true, ruleType is APPROX
+        args = new TreeMap<String, String>();
+        assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+        args.put("ruleType", "APPROX");
+        assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+        assertEquals(encode(args, true, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
+        assertEquals(encode(args, true, "1234"), "");
+
+        // concat is false, ruleType is APPROX
+        args = new TreeMap<String, String>();
+        assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+        args.put("ruleType", "APPROX");
+        assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+        assertEquals(encode(args, false, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
+        assertEquals(encode(args, false, "1234"), "");
+    }
+
+    @Test
+    public void testSolrASHKENAZI() {
+        Map<String, String> args;
+
+        // concat is true, ruleType is EXACT
+        args = new TreeMap<String, String>();
+        args.put("nameType", "ASHKENAZI");
+        assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+        args.put("ruleType", "EXACT");
+        assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anhelo|anxelo");
+        assertEquals(encode(args, true, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, true, "Angelo"), "angelo|anxelo");
+        assertEquals(encode(args, true, "1234"), "");
+
+        // concat is false, ruleType is EXACT
+        args = new TreeMap<String, String>();
+        args.put("nameType", "ASHKENAZI");
+        assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+        args.put("ruleType", "EXACT");
+        assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anhelo|anxelo");
+        assertEquals(encode(args, false, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, false, "Angelo"), "angelo|anxelo");
+        assertEquals(encode(args, false, "1234"), "");
+
+        // concat is true, ruleType is APPROX
+        args = new TreeMap<String, String>();
+        args.put("nameType", "ASHKENAZI");
+        assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+        args.put("ruleType", "APPROX");
+        assertEquals(encode(args, true, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+        assertEquals(encode(args, true, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, true, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
+        assertEquals(encode(args, true, "1234"), "");
+
+        // concat is false, ruleType is APPROX
+        args = new TreeMap<String, String>();
+        args.put("nameType", "ASHKENAZI");
+        assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+        args.put("ruleType", "APPROX");
+        assertEquals(encode(args, false, "Angelo"), "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO");
+        assertEquals(encode(args, false, "D'Angelo"), "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, false, "Angelo"), "AnSelO|AngElO|AngzelO|AnkselO");
+        assertEquals(encode(args, false, "1234"), "");
+    }
+
+    @Test
+    public void testSolrSEPHARDIC() {
+        Map<String, String> args;
+
+        // concat is true, ruleType is EXACT
+        args = new TreeMap<String, String>();
+        args.put("nameType", "SEPHARDIC");
+        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        args.put("ruleType", "EXACT");
+        assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|anxelo");
+        assertEquals(encode(args, true, "D'Angelo"), "anZelo|andZelo|anxelo");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, true, "Angelo"), "andZelo|anxelo");
+        assertEquals(encode(args, true, "1234"), "");
+
+        // concat is false, ruleType is EXACT
+        args = new TreeMap<String, String>();
+        args.put("nameType", "SEPHARDIC");
+        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        args.put("ruleType", "EXACT");
+        assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|anxelo");
+        assertEquals(encode(args, false, "D'Angelo"), "danZelo|dandZelo|danxelo");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, false, "Angelo"), "andZelo|anxelo");
+        assertEquals(encode(args, false, "1234"), "");
+
+        // concat is true, ruleType is APPROX
+        args = new TreeMap<String, String>();
+        args.put("nameType", "SEPHARDIC");
+        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        args.put("ruleType", "APPROX");
+        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        assertEquals(encode(args, true, "D'Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        assertEquals(encode(args, true, "1234"), "");
+
+        // concat is false, ruleType is APPROX
+        args = new TreeMap<String, String>();
+        args.put("nameType", "SEPHARDIC");
+        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        args.put("ruleType", "APPROX");
+        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        assertEquals(encode(args, false, "D'Angelo"), "danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu");
+        args.put("languageSet", "italian,greek,spanish");
+        assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
+        assertEquals(encode(args, false, "1234"), "");
+    }
+
+    /**
+     * This code is similar in style to code found in Solr:
+     * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
+     *
+     * Making a JUnit test out of it to protect Solr from possible future
+     * regressions in Commons-Codec.
+     */
+    private static String encode(Map<String, String> args, boolean concat, String input) {
+        Languages.LanguageSet languageSet;
+        PhoneticEngine engine;
+
+        // PhoneticEngine = NameType + RuleType + concat
+        // we use common-codec's defaults: GENERIC + APPROX + true
+        String nameTypeArg = args.get("nameType");
+        NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
+
+        String ruleTypeArg = args.get("ruleType");
+        RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
+
+        engine = new PhoneticEngine(nameType, ruleType, concat);
+
+        // LanguageSet: defaults to automagic, otherwise a comma-separated list.
+        String languageSetArg = args.get("languageSet");
+        if (languageSetArg == null || languageSetArg.equals("auto")) {
+            languageSet = null;
+        } else {
+            languageSet = Languages.LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
+        }
+
+        /*
+            org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
+
+            encoded = (languages == null)
+                ? engine.encode(termAtt.toString())
+                : engine.encode(termAtt.toString(), languages);
+
+            Hence our approach, below:
+        */
+        if (languageSet == null) {
+            return engine.encode(input);
+        } else {
+            return engine.encode(input, languageSet);
+        }
+    }
+}

Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
------------------------------------------------------------------------------
    svn:keywords = Id Revision HeadURL

Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain