You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2014/06/12 00:45:54 UTC
svn commit: r1602044 - in /commons/proper/codec/trunk: ./ src/changes/
src/main/java/org/apache/commons/codec/language/bm/
src/main/resources/org/apache/commons/codec/language/bm/
src/test/java/org/apache/commons/codec/language/bm/
Author: ggregory
Date: Wed Jun 11 22:45:54 2014
New Revision: 1602044
URL: http://svn.apache.org/r1602044
Log:
<action dev="ggregory" type="fix" issue="CODEC-187" due-to="Michael Tobias, Thomas Neidhart">Beider Morse Phonetic Matching producing incorrect tokens</action>
Modified:
commons/proper/codec/trunk/NOTICE.txt
commons/proper/codec/trunk/src/changes/changes.xml
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
Modified: commons/proper/codec/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/NOTICE.txt?rev=1602044&r1=1602043&r2=1602044&view=diff
==============================================================================
--- commons/proper/codec/trunk/NOTICE.txt (original)
+++ commons/proper/codec/trunk/NOTICE.txt Wed Jun 11 22:45:54 2014
@@ -7,3 +7,11 @@ The Apache Software Foundation (http://w
src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java
contains test data from http://aspell.net/test/orig/batch0.tab.
Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org)
+
+===============================================================================
+
+The content of package org.apache.commons.codec.language.bm has been translated
+from the original php source code available at http://stevemorse.org/phoneticinfo.htm
+with permission from the original authors.
+Original source copyright:
+Copyright (c) 2008 Alexander Beider & Stephen P. Morse.
Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1602044&r1=1602043&r2=1602044&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Wed Jun 11 22:45:54 2014
@@ -43,6 +43,7 @@ The <action> type attribute can be add,u
</properties>
<body>
<release version="1.10" date="DD Mmmm 2014" description="Feature and fix release.">
+ <action dev="ggregory" type="fix" issue="CODEC-187" due-to="Michael Tobias, Thomas Neidhart">Beider Morse Phonetic Matching producing incorrect tokens</action>
<action dev="ggregory" type="fix" issue="CODEC-184" due-to="Cyrille Artho">NullPointerException in DoubleMetaPhone.isDoubleMetaphoneEqual when using empty strings</action>
<action dev="ggregory" type="add" issue="CODEC-181" due-to="Ivan Martinez-Ortiz">Make possible to provide padding byte to BaseNCodec in constructor</action>
<action dev="ggregory" type="fix" issue="CODEC-180" due-to="Ville Skyttä">Fix Javadoc 1.8.0 errors</action>
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1602044&r1=1602043&r2=1602044&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Wed Jun 11 22:45:54 2014
@@ -65,6 +65,8 @@ import org.apache.commons.codec.StringEn
* Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
* splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
*
+ * @see <a href="http://stevemorse.org/phonetics/bmpm.htm">Beider-Morse Phonetic Matching</a>
+ * @see <a href="http://stevemorse.org/phoneticinfo.htm">Reference implementation</a>
* @since 1.6
* @version $Id$
*/
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt?rev=1602044&r1=1602043&r2=1602044&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_approx_any.txt Wed Jun 11 22:45:54 2014
@@ -99,7 +99,7 @@
"i" "" "[aeou]" "j"
"y" "[aáuiíoóeéê]" "" "j"
"y" "" "[aeiíou]" "j"
-"e" "" "$" "(e|E[$french])"
+"e" "" "$" "(e|E[french])"
"ão" "" "" "(au|an)" // Port
"ãe" "" "" "(aj|an)" // Port
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt?rev=1602044&r1=1602043&r2=1602044&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/ash_rules_any.txt Wed Jun 11 22:45:54 2014
@@ -84,7 +84,7 @@
"cz" "" "" "tS" // Polish
"cia" "" "[bcdgkpstwzż]" "(tSB[polish]|tsB)"
-"cia" "" "" "(tSa[$polish]|tsa)"
+"cia" "" "" "(tSa[polish]|tsa)"
"cią" "" "[bp]" "(tSom[polish]|tsom)"
"cią" "" "" "(tSon[polish]|tson)"
"cię" "" "[bp]" "(tSem[polish]|tsem)"
Modified: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt?rev=1602044&r1=1602043&r2=1602044&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt (original)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/bm/gen_approx_any.txt Wed Jun 11 22:45:54 2014
@@ -41,7 +41,7 @@
"s" "" "$" "(s|[french])"
"t" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(t|[french])" // Petitjean
"s" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(s|[french])" // Groslot, Grosleau
- //array("p" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(p|[$french])"
+ //array("p" "[aeiouAEIOU]" "[^aeiouAEIOU]" "(p|[french])"
"I" "[aeiouAEIBFOUQY]" "" "i"
"I" "" "[^aeiouAEBFIOU]e" "(Q[german]|i|D[english])" // "line"
@@ -86,7 +86,7 @@
"E" "" "$" "i"
"E" "[DaoiuAOIUQY]" "" "i"
"E" "" "[aoAOQY]" "i"
-"E" "" "" "(i|Y[$german])"
+"E" "" "" "(i|Y[german])"
"P" "" "" "(o|u)"
@@ -94,14 +94,14 @@
"O" "" "ts$" "o"
"O" "" "$" "o"
"O" "[oeiuQY]" "" "o"
-"O" "" "" "(o|Y[$german])"
+"O" "" "" "(o|Y[german])"
"O" "" "" "o"
"A" "" "[fklmnprst]$" "(a|o)"
"A" "" "ts$" "(a|o)"
"A" "" "$" "(a|o)"
"A" "[oeiuQY]" "" "(a|o)"
-"A" "" "" "(a|o|Y[$german])"
+"A" "" "" "(a|o|Y[german])"
"A" "" "" "(a|o)"
"U" "" "$" "u"
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java?rev=1602044&r1=1602043&r2=1602044&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineRegressionTest.java Wed Jun 11 22:45:54 2014
@@ -40,7 +40,7 @@ public class PhoneticEngineRegressionTes
// concat is true, ruleType is EXACT
args = new TreeMap<String, String>();
args.put("nameType", "GENERIC");
- assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
args.put("ruleType", "EXACT");
assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
assertEquals(encode(args, true, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
@@ -50,7 +50,7 @@ public class PhoneticEngineRegressionTes
// concat is false, ruleType is EXACT
args = new TreeMap<String, String>();
- assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
args.put("ruleType", "EXACT");
assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
assertEquals(encode(args, false, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
@@ -60,20 +60,20 @@ public class PhoneticEngineRegressionTes
// concat is true, ruleType is APPROX
args = new TreeMap<String, String>();
- assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
args.put("ruleType", "APPROX");
- assertEquals(encode(args, true, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
- assertEquals(encode(args, true, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
+ assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, true, "D'Angelo"), "(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)");
args.put("languageSet", "italian,greek,spanish");
assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
assertEquals(encode(args, true, "1234"), "");
// concat is false, ruleType is APPROX
args = new TreeMap<String, String>();
- assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
args.put("ruleType", "APPROX");
- assertEquals(encode(args, false, "Angelo"), "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo");
- assertEquals(encode(args, false, "D'Angelo"), "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)");
+ assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
+ assertEquals(encode(args, false, "D'Angelo"), "(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)");
args.put("languageSet", "italian,greek,spanish");
assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
assertEquals(encode(args, false, "1234"), "");
@@ -177,6 +177,17 @@ public class PhoneticEngineRegressionTes
assertEquals(encode(args, false, "1234"), "");
}
+ @Test
+ public void testCompatibilityWithOriginalVersion() {
+ // see CODEC-187
+ // comparison: http://stevemorse.org/census/soundex.html
+
+ Map<String, String> args = new TreeMap<String, String>();
+ args.put("nameType", "GENERIC");
+ args.put("ruleType", "APPROX");
+ assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
+ }
+
/**
* This code is similar in style to code found in Solr:
* solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
Re: svn commit: r1602044 - in /commons/proper/codec/trunk: ./ src/changes/
src/main/java/org/apache/commons/codec/language/bm/ src/main/resources/org/apache/commons/codec/language/bm/
src/test/java/org/apache/commons/codec/language/bm/
Posted by Emmanuel Bourg <eb...@apache.org>.
Le 12/06/2014 14:25, sebb a écrit :
> Please don't use XML for log messages.
>
> It may be easier to copy/paste the entry, but it's a lot harder for
> everyone else to read.
> And there are more readers than writers ...
+1
Emmanuel Bourg
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
For additional commands, e-mail: dev-help@commons.apache.org
Re: svn commit: r1602044 - in /commons/proper/codec/trunk: ./
src/changes/ src/main/java/org/apache/commons/codec/language/bm/
src/main/resources/org/apache/commons/codec/language/bm/ src/test/java/org/apache/commons/codec/language/bm/
Posted by sebb <se...@gmail.com>.
On 11 June 2014 23:45, <gg...@apache.org> wrote:
> Author: ggregory
> Date: Wed Jun 11 22:45:54 2014
> New Revision: 1602044
>
> URL: http://svn.apache.org/r1602044
> Log:
> <action dev="ggregory" type="fix" issue="CODEC-187" due-to="Michael Tobias, Thomas Neidhart">Beider Morse Phonetic Matching producing incorrect tokens</action>
Please don't use XML for log messages.
It may be easier to copy/paste the entry, but it's a lot harder for
everyone else to read.
And there are more readers than writers ...
> Modified:
Please fix up any XML log messages.
Thanks.
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@commons.apache.org
For additional commands, e-mail: dev-help@commons.apache.org