You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2014/11/04 03:18:12 UTC
svn commit: r1636486 - in /commons/proper/codec/trunk/src: changes/
main/java/org/apache/commons/codec/language/
main/resources/org/apache/commons/codec/language/
test/java/org/apache/commons/codec/language/
Author: ggregory
Date: Tue Nov 4 02:18:12 2014
New Revision: 1636486
URL: http://svn.apache.org/r1636486
Log:
[CODEC-192] Add Daitch–Mokotoff Soundex.
Added:
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java (with props)
commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt (with props)
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java (with props)
Modified:
commons/proper/codec/trunk/src/changes/changes.xml
Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1636486&r1=1636485&r2=1636486&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Tue Nov 4 02:18:12 2014
@@ -43,6 +43,7 @@ The <action> type attribute can be add,u
</properties>
<body>
<release version="1.10" date="DD Mmmm 2014" description="Feature and fix release.">
+ <action dev="ggregory" type="add" issue="CODEC-192" due-to="Thomas Neidhart">Add Daitch–Mokotoff Soundex</action>
<action dev="tn" type="fix" issue="CODEC-185" due-to="Sean Busbey">Added clarification to javadoc of Base64 concerning the use of the urlSafe parameter</action>
<action dev="tn" type="fix" issue="CODEC-191" due-to="Igor Savin">Added clarification to the javadoc of Base[32|64]OutputStream that it is mandatory to call close()</action>
<action dev="ggregory" type="fix" issue="CODEC-188" due-to="Hendrik Saly">Add support for HMAC Message Authentication Code (MAC) digests</action>
Added: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java?rev=1636486&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java (added)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java Tue Nov 4 02:18:12 2014
@@ -0,0 +1,554 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.codec.language;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+
+import org.apache.commons.codec.CharEncoding;
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Encodes a string into a Daitch-Mokotoff Soundex value.
+ * <p>
+ * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, yielding greater
+ * accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation but differences in spelling.
+ * <p>
+ * The main differences compared to the other soundex variants are:
+ * <ul>
+ * <li>coded names are 6 digits long
+ * <li>the initial character of the name is coded
+ * <li>rules to encode multi-character n-grams
+ * <li>multiple possible encodings for the same name (branching)
+ * </ul>
+ * <p>
+ * This implementation supports branching, depending on the used method:
+ * <ul>
+ * <li>{@link #encode(String)} - branching disabled, only the first code will be returned
+ * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|'
+ * </ul>
+ * <p>
+ * Note: this implementation has additional branching rules compared to the original description of the algorithm. The
+ * rules can be customized by overriding the default rules contained in the resource file
+ * {@code org/apache/commons/codec/language/dmrules.txt}.
+ * <p>
+ * This class is thread-safe.
+ *
+ * @see Soundex
+ * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
+ * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
+ *
+ * @version $Id$
+ * @since 1.10
+ */
+public class DaitchMokotoffSoundex implements StringEncoder {
+
+ /**
+ * Inner class representing a branch during DM soundex encoding.
+ */
+ private static final class Branch {
+ private final StringBuilder builder;
+ private String cachedString;
+ private String lastReplacement;
+
+ private Branch() {
+ builder = new StringBuilder();
+ lastReplacement = null;
+ cachedString = null;
+ }
+
+ /**
+ * Creates a new branch, identical to this branch.
+ *
+ * @return a new, identical branch
+ */
+ public Branch createBranch() {
+ final Branch branch = new Branch();
+ branch.builder.append(toString());
+ branch.lastReplacement = this.lastReplacement;
+ return branch;
+ }
+
+ @Override
+ public boolean equals(final Object other) {
+ if (this == other) {
+ return true;
+ }
+ if (!(other instanceof Branch)) {
+ return false;
+ }
+
+ return toString().equals(((Branch) other).toString());
+ }
+
+ /**
+ * Finish this branch by appending '0's until the maximum code length has been reached.
+ */
+ public void finish() {
+ while (builder.length() < MAX_LENGTH) {
+ builder.append('0');
+ cachedString = null;
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return toString().hashCode();
+ }
+
+ /**
+ * Process the next replacement to be added to this branch.
+ *
+ * @param replacement
+ * the next replacement to append
+ * @param forceAppend
+ * indicates if the default processing shall be overridden
+ */
+ public void processNextReplacement(final String replacement, final boolean forceAppend) {
+ final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend;
+
+ if (append && builder.length() < MAX_LENGTH) {
+ builder.append(replacement);
+ // remove all characters after the maximum length
+ if (builder.length() > MAX_LENGTH) {
+ builder.delete(MAX_LENGTH, builder.length());
+ }
+ cachedString = null;
+ }
+
+ lastReplacement = replacement;
+ }
+
+ @Override
+ public String toString() {
+ if (cachedString == null) {
+ cachedString = builder.toString();
+ }
+ return cachedString;
+ }
+ }
+
+ // static identifiers used during parsing of the rule file
+
+    /**
+     * A single transformation rule: a pattern plus the replacement codes to use depending on where the pattern
+     * occurs (at the start of the name, before a vowel, or anywhere else).
+     * <p>
+     * Each replacement is given as a string in which alternative codes are separated by '|'.
+     */
+    private static final class Rule {
+        private final String pattern;
+        private final String[] replacementAtStart;
+        private final String[] replacementBeforeVowel;
+        private final String[] replacementDefault;
+
+        protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel,
+                final String replacementDefault) {
+            this.pattern = pattern;
+            this.replacementAtStart = replacementAtStart.split("\\|");
+            this.replacementBeforeVowel = replacementBeforeVowel.split("\\|");
+            this.replacementDefault = replacementDefault.split("\\|");
+        }
+
+        public int getPatternLength() {
+            return pattern.length();
+        }
+
+        /**
+         * Selects the replacement codes that apply to the given context.
+         *
+         * @param context
+         *            the remaining input, starting with the text matched by this rule
+         * @param atStart
+         *            whether the match is at the start of the name
+         * @return the alternative replacement codes for this position
+         */
+        public String[] getReplacements(final String context, final boolean atStart) {
+            if (atStart) {
+                return replacementAtStart;
+            }
+
+            // look at the character directly following the matched pattern, if there is one
+            final int followingIndex = getPatternLength();
+            if (followingIndex < context.length() && isVowel(context.charAt(followingIndex))) {
+                return replacementBeforeVowel;
+            }
+
+            return replacementDefault;
+        }
+
+        private boolean isVowel(final char ch) {
+            return "aeiou".indexOf(ch) >= 0;
+        }
+
+        public boolean matches(final String context) {
+            return context.startsWith(pattern);
+        }
+
+        @Override
+        public String toString() {
+            final List<String> atStart = Arrays.asList(replacementAtStart);
+            final List<String> beforeVowel = Arrays.asList(replacementBeforeVowel);
+            final List<String> other = Arrays.asList(replacementDefault);
+            return String.format("%s=(%s,%s,%s)", pattern, atStart, beforeVowel, other);
+        }
+    }
+
+ /** Marker that starts a single-line comment in the rule file. */
+ private static final String COMMENT = "//";
+ /** Quote character that is stripped from the tokens of a rule line. */
+ private static final String DOUBLE_QUOTE = "\"";
+ /** Folding rules: maps a character to the character it is replaced with during cleanup. */
+ private static final Map<Character, Character> FOLDINGS = new HashMap<Character, Character>();
+
+ /** The code length of a DM soundex value. */
+ private static final int MAX_LENGTH = 6;
+ /** Marker that ends a multi-line comment in the rule file. */
+ private static final String MULTILINE_COMMENT_END = "*/";
+
+ /** Marker that starts a multi-line comment in the rule file. */
+ private static final String MULTILINE_COMMENT_START = "/*";
+
+ /** The resource file containing the replacement and folding rules */
+ private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
+
+ /** Transformation rules indexed by the first character of their pattern. */
+ private static final Map<Character, List<Rule>> RULES = new HashMap<Character, List<Rule>>();
+
+    // Loads the rule resource once, fills RULES and FOLDINGS, and orders the rules of every
+    // starting character so that longer patterns are tried before shorter ones.
+    static {
+        final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE);
+        if (rulesIS == null) {
+            throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE);
+        }
+
+        final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8);
+        try {
+            parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
+        } finally {
+            // closing the scanner also closes the underlying stream, even if parsing failed
+            scanner.close();
+        }
+
+        // sort RULES by pattern length in descending order
+        for (final List<Rule> ruleList : RULES.values()) {
+            Collections.sort(ruleList, new Comparator<Rule>() {
+                @Override
+                public int compare(final Rule rule1, final Rule rule2) {
+                    // pattern lengths are non-negative, so the subtraction cannot overflow
+                    return rule2.getPatternLength() - rule1.getPatternLength();
+                }
+            });
+        }
+    }
+
+ private static void parseRules(final Scanner scanner, final String location,
+ final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) {
+ int currentLine = 0;
+ boolean inMultilineComment = false;
+
+ while (scanner.hasNextLine()) {
+ currentLine++;
+ final String rawLine = scanner.nextLine();
+ String line = rawLine;
+
+ if (inMultilineComment) {
+ if (line.endsWith(MULTILINE_COMMENT_END)) {
+ inMultilineComment = false;
+ }
+ continue;
+ }
+
+ if (line.startsWith(MULTILINE_COMMENT_START)) {
+ inMultilineComment = true;
+ } else {
+ // discard comments
+ final int cmtI = line.indexOf(COMMENT);
+ if (cmtI >= 0) {
+ line = line.substring(0, cmtI);
+ }
+
+ // trim leading-trailing whitespace
+ line = line.trim();
+
+ if (line.length() == 0) {
+ continue; // empty lines can be safely skipped
+ }
+
+ if (line.contains("=")) {
+ // folding
+ final String[] parts = line.split("=");
+ if (parts.length != 2) {
+ throw new IllegalArgumentException("Malformed folding statement split into " + parts.length +
+ " parts: " + rawLine + " in " + location);
+ } else {
+ final String leftCharacter = parts[0];
+ final String rightCharacter = parts[1];
+
+ if (leftCharacter.length() != 1 || rightCharacter.length() != 1) {
+ throw new IllegalArgumentException("Malformed folding statement - " +
+ "patterns are not single characters: " + rawLine + " in " + location);
+ }
+
+ asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0));
+ }
+ } else {
+ // rule
+ final String[] parts = line.split("\\s+");
+ if (parts.length != 4) {
+ throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
+ " parts: " + rawLine + " in " + location);
+ } else {
+ try {
+ final String pattern = stripQuotes(parts[0]);
+ final String replacement1 = stripQuotes(parts[1]);
+ final String replacement2 = stripQuotes(parts[2]);
+ final String replacement3 = stripQuotes(parts[3]);
+
+ final Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
+ final char patternKey = r.pattern.charAt(0);
+ List<Rule> rules = ruleMapping.get(patternKey);
+ if (rules == null) {
+ rules = new ArrayList<Rule>();
+ ruleMapping.put(patternKey, rules);
+ }
+ rules.add(r);
+ } catch (final IllegalArgumentException e) {
+ throw new IllegalStateException(
+ "Problem parsing line '" + currentLine + "' in " + location, e);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private static String stripQuotes(String str) {
+ if (str.startsWith(DOUBLE_QUOTE)) {
+ str = str.substring(1);
+ }
+
+ if (str.endsWith(DOUBLE_QUOTE)) {
+ str = str.substring(0, str.length() - 1);
+ }
+
+ return str;
+ }
+
+ /** Whether to use ascii folding prior to encoding. */
+ private final boolean folding;
+
+ /**
+ * Creates a new instance with ascii-folding enabled.
+ */
+ public DaitchMokotoffSoundex() {
+ this(true);
+ }
+
+ /**
+ * Creates a new instance.
+ * <p>
+ * With ascii-folding enabled, certain accented characters will be transformed to equivalent ascii characters, e.g.
+ * è -&gt; e.
+ *
+ * @param folding
+ * if ascii-folding shall be performed before encoding
+ */
+ public DaitchMokotoffSoundex(final boolean folding) {
+ this.folding = folding;
+ }
+
+    /**
+     * Normalizes the input string before the actual soundex transformation.
+     * <p>
+     * Whitespace characters are dropped, every remaining character is converted to lower case and, if ascii folding
+     * is enabled, replaced by its folded counterpart when a folding is defined for it.
+     *
+     * @param input
+     *            the input string to cleanup
+     * @return a cleaned up string
+     */
+    private String cleanup(final String input) {
+        final StringBuilder cleaned = new StringBuilder();
+        for (int i = 0; i < input.length(); i++) {
+            final char raw = input.charAt(i);
+            if (!Character.isWhitespace(raw)) {
+                char normalized = Character.toLowerCase(raw);
+                if (folding) {
+                    final Character folded = FOLDINGS.get(normalized);
+                    if (folded != null) {
+                        normalized = folded;
+                    }
+                }
+                cleaned.append(normalized);
+            }
+        }
+        return cleaned.toString();
+    }
+
+ // -- BEGIN STATIC METHODS --//
+
+    /**
+     * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
+     * <p>
+     * This method exists to satisfy the Encoder interface; it only accepts {@code java.lang.String} arguments and
+     * delegates to {@link #encode(String)}.
+     *
+     * @see #soundex(String)
+     *
+     * @param obj
+     *            Object to encode
+     * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
+     *         supplied.
+     * @throws EncoderException
+     *             if the parameter supplied is not of type java.lang.String
+     */
+    @Override
+    public Object encode(final Object obj) throws EncoderException {
+        if (obj instanceof String) {
+            return encode((String) obj);
+        }
+        throw new EncoderException(
+                "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
+    }
+
+    /**
+     * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
+     * <p>
+     * Only the first code is returned, even if the name could be encoded in several ways.
+     *
+     * @see #soundex(String)
+     *
+     * @param source
+     *            A String object to encode
+     * @return A DM Soundex code corresponding to the String supplied, or {@code null} if the supplied String is
+     *         {@code null}
+     */
+    @Override
+    public String encode(final String source) {
+        return source == null ? null : soundex(source, false)[0];
+    }
+
+ // -- BEGIN INNER CLASSES --//
+
+    /**
+     * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.
+     * <p>
+     * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
+     * separated by '|'.
+     * <p>
+     * Example: the name "AUERBACH" is encoded as both
+     * <ul>
+     * <li>097400</li>
+     * <li>097500</li>
+     * </ul>
+     * Thus the result will be "097400|097500".
+     *
+     * @param source
+     *            A String object to encode
+     * @return A string containing a set of DM Soundex codes corresponding to the String supplied, or {@code null}
+     *         if the supplied String is {@code null}
+     */
+    public String soundex(final String source) {
+        final String[] branches = soundex(source, true);
+        if (branches == null) {
+            // null input: behave like encode(String) instead of failing with a NullPointerException
+            return null;
+        }
+
+        final StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < branches.length; i++) {
+            if (i > 0) {
+                sb.append('|');
+            }
+            sb.append(branches[i]);
+        }
+        return sb.toString();
+    }
+
+ /**
+ * Perform the actual DM soundex algorithm on the input string.
+ * <p>
+ * The input is cleaned up first and then scanned from left to right. At every position the rules registered for
+ * the current character are tried in order and the first matching rule is applied to all current branches.
+ * Characters for which no rules are registered are skipped.
+ *
+ * @param source
+ * A String object to encode
+ * @param branching
+ * If branching shall be performed
+ * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the
+ * selected branching mode, or {@code null} if the supplied String is {@code null}
+ */
+ private String[] soundex(final String source, final boolean branching) {
+ if (source == null) {
+ return null;
+ }
+
+ final String input = cleanup(source);
+
+ // a set, so that branches which end up with the same code are kept only once
+ final Set<Branch> currentBranches = new LinkedHashSet<Branch>();
+ currentBranches.add(new Branch());
+
+ // '\0' means that no character with a rule has been processed yet, i.e. we are still at the start
+ char lastChar = '\0';
+ for (int index = 0; index < input.length(); index++) {
+ final char ch = input.charAt(index);
+
+ // ignore whitespace inside a name
+ if (Character.isWhitespace(ch)) {
+ continue;
+ }
+
+ final String inputContext = input.substring(index);
+ final List<Rule> rules = RULES.get(ch);
+ // characters without any rule are skipped and do not update lastChar
+ if (rules == null) {
+ continue;
+ }
+
+ // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
+ @SuppressWarnings("unchecked")
+ final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST;
+
+ for (final Rule rule : rules) {
+ if (rule.matches(inputContext)) {
+ if (branching) {
+ nextBranches.clear();
+ }
+ final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0');
+ final boolean branchingRequired = replacements.length > 1 && branching;
+
+ for (final Branch branch : currentBranches) {
+ for (final String nextReplacement : replacements) {
+ // if we have multiple replacements, always create a new branch
+ final Branch nextBranch = branchingRequired ? branch.createBranch() : branch;
+
+ // special rule: occurrences of mn or nm are treated differently
+ final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm');
+
+ nextBranch.processNextReplacement(nextReplacement, force);
+
+ if (branching) {
+ nextBranches.add(nextBranch);
+ } else {
+ // without branching only the first replacement is applied
+ break;
+ }
+ }
+ }
+
+ if (branching) {
+ currentBranches.clear();
+ currentBranches.addAll(nextBranches);
+ }
+ // skip the remaining characters of the matched pattern (the loop increment adds the final 1)
+ index += rule.getPatternLength() - 1;
+ // only the first matching rule is applied
+ break;
+ }
+ }
+
+ // note: this is the first character of the processed pattern
+ lastChar = ch;
+ }
+
+ // pad every branch to the full code length and collect the codes
+ final String[] result = new String[currentBranches.size()];
+ int index = 0;
+ for (final Branch branch : currentBranches) {
+ branch.finish();
+ result[index++] = branch.toString();
+ }
+
+ return result;
+ }
+}
Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt?rev=1636486&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt (added)
+++ commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt Tue Nov 4 02:18:12 2014
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Format
+// "pattern" "replacement at start of word" "replacement before a vowel" "replacement in other cases"
+
+// Vowels
+
+"a" "0" "" ""
+"e" "0" "" ""
+"i" "0" "" ""
+"o" "0" "" ""
+"u" "0" "" ""
+
+// Consonants
+
+"b" "7" "7" "7"
+"d" "3" "3" "3"
+"f" "7" "7" "7"
+"g" "5" "5" "5"
+"h" "5" "5" ""
+"k" "5" "5" "5"
+"l" "8" "8" "8"
+"m" "6" "6" "6"
+"n" "6" "6" "6"
+"p" "7" "7" "7"
+"q" "5" "5" "5"
+"r" "9" "9" "9"
+"s" "4" "4" "4"
+"t" "3" "3" "3"
+"v" "7" "7" "7"
+"w" "7" "7" "7"
+"x" "5" "54" "54"
+"y" "1" "" ""
+"z" "4" "4" "4"
+
+// Romanian t-cedilla and t-comma should be equivalent
+"ţ" "3|4" "3|4" "3|4"
+"ț" "3|4" "3|4" "3|4"
+
+// Polish characters (e-ogonek and a-ogonek): default case branch either not coded or 6
+"ę" "" "" "|6"
+"ą" "" "" "|6"
+
+// Other terms
+
+"schtsch" "2" "4" "4"
+"schtsh" "2" "4" "4"
+"schtch" "2" "4" "4"
+"shtch" "2" "4" "4"
+"shtsh" "2" "4" "4"
+"stsch" "2" "4" "4"
+"ttsch" "4" "4" "4"
+"zhdzh" "2" "4" "4"
+"shch" "2" "4" "4"
+"scht" "2" "43" "43"
+"schd" "2" "43" "43"
+"stch" "2" "4" "4"
+"strz" "2" "4" "4"
+"strs" "2" "4" "4"
+"stsh" "2" "4" "4"
+"szcz" "2" "4" "4"
+"szcs" "2" "4" "4"
+"ttch" "4" "4" "4"
+"tsch" "4" "4" "4"
+"ttsz" "4" "4" "4"
+"zdzh" "2" "4" "4"
+"zsch" "4" "4" "4"
+"chs" "5" "54" "54"
+"csz" "4" "4" "4"
+"czs" "4" "4" "4"
+"drz" "4" "4" "4"
+"drs" "4" "4" "4"
+"dsh" "4" "4" "4"
+"dsz" "4" "4" "4"
+"dzh" "4" "4" "4"
+"dzs" "4" "4" "4"
+"sch" "4" "4" "4"
+"sht" "2" "43" "43"
+"szt" "2" "43" "43"
+"shd" "2" "43" "43"
+"szd" "2" "43" "43"
+"tch" "4" "4" "4"
+"trz" "4" "4" "4"
+"trs" "4" "4" "4"
+"tsh" "4" "4" "4"
+"tts" "4" "4" "4"
+"ttz" "4" "4" "4"
+"tzs" "4" "4" "4"
+"tsz" "4" "4" "4"
+"zdz" "2" "4" "4"
+"zhd" "2" "43" "43"
+"zsh" "4" "4" "4"
+"ai" "0" "1" ""
+"aj" "0" "1" ""
+"ay" "0" "1" ""
+"au" "0" "7" ""
+"cz" "4" "4" "4"
+"cs" "4" "4" "4"
+"ds" "4" "4" "4"
+"dz" "4" "4" "4"
+"dt" "3" "3" "3"
+"ei" "0" "1" ""
+"ej" "0" "1" ""
+"ey" "0" "1" ""
+"eu" "1" "1" ""
+"fb" "7" "7" "7"
+"ia" "1" "" ""
+"ie" "1" "" ""
+"io" "1" "" ""
+"iu" "1" "" ""
+"ks" "5" "54" "54"
+"kh" "5" "5" "5"
+"mn" "66" "66" "66"
+"nm" "66" "66" "66"
+"oi" "0" "1" ""
+"oj" "0" "1" ""
+"oy" "0" "1" ""
+"pf" "7" "7" "7"
+"ph" "7" "7" "7"
+"sh" "4" "4" "4"
+"sc" "2" "4" "4"
+"st" "2" "43" "43"
+"sd" "2" "43" "43"
+"sz" "4" "4" "4"
+"th" "3" "3" "3"
+"ts" "4" "4" "4"
+"tc" "4" "4" "4"
+"tz" "4" "4" "4"
+"ui" "0" "1" ""
+"uj" "0" "1" ""
+"uy" "0" "1" ""
+"ue" "0" "1" ""
+"zd" "2" "43" "43"
+"zh" "4" "4" "4"
+"zs" "4" "4" "4"
+
+// Branching cases
+
+"c" "4|5" "4|5" "4|5"
+"ch" "4|5" "4|5" "4|5"
+"ck" "5|45" "5|45" "5|45"
+"rs" "4|94" "4|94" "4|94"
+"rz" "4|94" "4|94" "4|94"
+"j" "1|4" "|4" "|4"
+
+
+// ASCII foldings
+
+ß=s
+à=a
+á=a
+â=a
+ã=a
+ä=a
+å=a
+æ=a
+ç=c
+è=e
+é=e
+ê=e
+ë=e
+ì=i
+í=i
+î=i
+ï=i
+ð=d
+ñ=n
+ò=o
+ó=o
+ô=o
+õ=o
+ö=o
+ø=o
+ù=u
+ú=u
+û=u
+ý=y
+ý=y
+þ=b
+ÿ=y
+ć=c
+ł=l
+ś=s
+ż=z
+ź=z
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/codec/trunk/src/main/resources/org/apache/commons/codec/language/dmrules.txt
------------------------------------------------------------------------------
svn:keywords = Id
Added: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java?rev=1636486&view=auto
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java (added)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java Tue Nov 4 02:18:12 2014
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoderAbstractTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests {@link DaitchMokotoffSoundex}.
+ * <p>
+ * Non-ASCII characters in the test data are written as Unicode escapes so that the tests do not depend on the
+ * encoding used to read this source file.
+ * </p>
+ *
+ * @since 1.10
+ */
+public class DaitchMokotoffSoundexTest extends StringEncoderAbstractTest<DaitchMokotoffSoundex> {
+
+    @Override
+    protected DaitchMokotoffSoundex createStringEncoder() {
+        return new DaitchMokotoffSoundex();
+    }
+
+    /**
+     * Accented characters must be folded to their ASCII counterparts before encoding.
+     */
+    @Test
+    public void testAccentedCharacterFolding() {
+        // "Stra\u00DFburg" contains a sharp s (U+00DF)
+        Assert.assertEquals("294795", this.getStringEncoder().soundex("Stra\u00DFburg"));
+        Assert.assertEquals("294795", this.getStringEncoder().soundex("Strasburg"));
+
+        // "\u00C9regon" starts with an E with acute accent (U+00C9)
+        Assert.assertEquals("095600", this.getStringEncoder().soundex("\u00C9regon"));
+        Assert.assertEquals("095600", this.getStringEncoder().soundex("Eregon"));
+    }
+
+    @Test
+    public void testAdjacentCodes() {
+        // AKSSOL
+        // A-KS-S-O-L
+        // 0-54-4---8 -> wrong
+        // 0-54-----8 -> correct
+        Assert.assertEquals("054800", this.getStringEncoder().soundex("AKSSOL"));
+
+        // GERSCHFELD
+        // G-E-RS-CH-F-E-L-D
+        // 5--4/94-5/4-7-8-3 -> wrong
+        // 5--4/94-5/--7-8-3 -> correct
+        Assert.assertEquals("547830|545783|594783|594578", this.getStringEncoder().soundex("GERSCHFELD"));
+    }
+
+    /**
+     * Same names as in {@link #testSoundexBasic()}, but encoded without branching.
+     */
+    @Test
+    public void testEncodeBasic() {
+        // same as above, but without branching
+        Assert.assertEquals("097400", this.getStringEncoder().encode("AUERBACH"));
+        Assert.assertEquals("097400", this.getStringEncoder().encode("OHRBACH"));
+        Assert.assertEquals("874400", this.getStringEncoder().encode("LIPSHITZ"));
+        Assert.assertEquals("874400", this.getStringEncoder().encode("LIPPSZYC"));
+        Assert.assertEquals("876450", this.getStringEncoder().encode("LEWINSKY"));
+        Assert.assertEquals("876450", this.getStringEncoder().encode("LEVINSKI"));
+        Assert.assertEquals("486740", this.getStringEncoder().encode("SZLAMAWICZ"));
+        Assert.assertEquals("486740", this.getStringEncoder().encode("SHLAMOVITZ"));
+    }
+
+    @Test
+    public void testEncodeIgnoreApostrophes() throws EncoderException {
+        this.checkEncodingVariations("079600", new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien",
+                "OBri'en", "OBrie'n", "OBrien'" });
+    }
+
+    /**
+     * Test data from http://www.myatt.demon.co.uk/sxalg.htm
+     *
+     * @throws EncoderException
+     *             if one of the variations cannot be encoded
+     */
+    @Test
+    public void testEncodeIgnoreHyphens() throws EncoderException {
+        this.checkEncodingVariations("565463", new String[] { "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH",
+                "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-" });
+    }
+
+    @Test
+    public void testEncodeIgnoreTrimmable() {
+        Assert.assertEquals("746536", this.getStringEncoder().encode(" \t\n\r Washington \t\n\r "));
+        Assert.assertEquals("746536", this.getStringEncoder().encode("Washington"));
+    }
+
+    /**
+     * Examples from http://www.jewishgen.org/infofiles/soundex.html
+     */
+    @Test
+    public void testSoundexBasic() {
+        Assert.assertEquals("583600", this.getStringEncoder().soundex("GOLDEN"));
+        Assert.assertEquals("087930", this.getStringEncoder().soundex("Alpert"));
+        Assert.assertEquals("791900", this.getStringEncoder().soundex("Breuer"));
+        Assert.assertEquals("579000", this.getStringEncoder().soundex("Haber"));
+        Assert.assertEquals("665600", this.getStringEncoder().soundex("Mannheim"));
+        Assert.assertEquals("664000", this.getStringEncoder().soundex("Mintz"));
+        Assert.assertEquals("370000", this.getStringEncoder().soundex("Topf"));
+        Assert.assertEquals("586660", this.getStringEncoder().soundex("Kleinmann"));
+        Assert.assertEquals("769600", this.getStringEncoder().soundex("Ben Aron"));
+
+        Assert.assertEquals("097400|097500", this.getStringEncoder().soundex("AUERBACH"));
+        Assert.assertEquals("097400|097500", this.getStringEncoder().soundex("OHRBACH"));
+        Assert.assertEquals("874400", this.getStringEncoder().soundex("LIPSHITZ"));
+        Assert.assertEquals("874400|874500", this.getStringEncoder().soundex("LIPPSZYC"));
+        Assert.assertEquals("876450", this.getStringEncoder().soundex("LEWINSKY"));
+        Assert.assertEquals("876450", this.getStringEncoder().soundex("LEVINSKI"));
+        Assert.assertEquals("486740", this.getStringEncoder().soundex("SZLAMAWICZ"));
+        Assert.assertEquals("486740", this.getStringEncoder().soundex("SHLAMOVITZ"));
+    }
+
+    /**
+     * Examples from http://www.avotaynu.com/soundex.htm
+     */
+    @Test
+    public void testSoundexBasic2() {
+        Assert.assertEquals("467000|567000", this.getStringEncoder().soundex("Ceniow"));
+        Assert.assertEquals("467000", this.getStringEncoder().soundex("Tsenyuv"));
+        Assert.assertEquals("587400|587500", this.getStringEncoder().soundex("Holubica"));
+        Assert.assertEquals("587400", this.getStringEncoder().soundex("Golubitsa"));
+        Assert.assertEquals("746480|794648", this.getStringEncoder().soundex("Przemysl"));
+        Assert.assertEquals("746480", this.getStringEncoder().soundex("Pshemeshil"));
+        Assert.assertEquals("944744|944745|944754|944755|945744|945745|945754|945755",
+                this.getStringEncoder().soundex("Rosochowaciec"));
+        Assert.assertEquals("945744", this.getStringEncoder().soundex("Rosokhovatsets"));
+    }
+
+    /**
+     * Examples from http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
+     */
+    @Test
+    public void testSoundexBasic3() {
+        Assert.assertEquals("734000|739400", this.getStringEncoder().soundex("Peters"));
+        Assert.assertEquals("734600|739460", this.getStringEncoder().soundex("Peterson"));
+        Assert.assertEquals("645740", this.getStringEncoder().soundex("Moskowitz"));
+        Assert.assertEquals("645740", this.getStringEncoder().soundex("Moskovitz"));
+        Assert.assertEquals("154600|145460|454600|445460", this.getStringEncoder().soundex("Jackson"));
+        Assert.assertEquals("154654|154645|154644|145465|145464|454654|454645|454644|445465|445464", this
+                .getStringEncoder().soundex("Jackson-Jackson"));
+    }
+
+    /**
+     * The Romanian t-cedilla and t-comma must be encoded identically.
+     */
+    @Test
+    public void testSpecialRomanianCharacters() {
+        Assert.assertEquals("364000|464000", this.getStringEncoder().soundex("\u0163amas")); // t-cedilla
+        Assert.assertEquals("364000|464000", this.getStringEncoder().soundex("\u021Bamas")); // t-comma
+    }
+
+}
Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java
------------------------------------------------------------------------------
svn:keywords = Id