You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2011/11/13 21:59:08 UTC
svn commit: r1201511 - in
/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm:
BeiderMorseEncoder.java Lang.java Languages.java NameType.java
PhoneticEngine.java Rule.java RuleType.java
Author: ggregory
Date: Sun Nov 13 20:59:07 2011
New Revision: 1201511
URL: http://svn.apache.org/viewvc?rev=1201511&view=rev
Log:
Apply documentation patch from Matthew Pocock. Thank you Matthew!
Modified:
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1201511&r1=1201510&r2=1201511&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Sun Nov 13 20:59:07 2011
@@ -31,11 +31,56 @@ import org.apache.commons.codec.StringEn
* This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it is mutable, and may not be
* thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly.
* </p>
+ *
+ * <h2>Encoding overview</h2>
+ *
+ * <p>
+ * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
+ * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French. Next,
+ * the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of letters
+ * can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
+ * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly,
+ * this language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking
+ * into account systematic pronunciation differences between languages, to move it towards a pan-Indo-European phonetic
+ * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
+ * pronounced in several ways in the source language have only one way to represent them in this average phonetic
+ * language, so the result is again a set of phonetic spellings.
+ * </p>
+ *
+ * <p>
+ * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
+ * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
+ * Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English) names. As sometimes it is
+ * ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once with the prefix
+ * and once without it. The resulting encoding contains one and then the other result.
+ * </p>
+ *
+ *
+ * <h2>Encoding format</h2>
+ *
+ * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
+ * are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character. If multiple hyphenated
+ * words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses and these blocks
+ * are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible prefix. The form without prefix encodes to
+ * "<code>ortlaj|ortlej</code>", while the form with prefix encodes to "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is
+ * "<code>(ortlaj|ortlej)-(dortlaj|dortlej)</code>".
+ *
+ * <p>
+ * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
+ * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to
+ * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The <code>APPROX</code> rules will tend to produce larger
+ * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
+ * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
+ * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
+ * </p>
*
* @author Apache Software Foundation
* @since 1.6
*/
public class BeiderMorseEncoder implements StringEncoder {
+ // implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
+ // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
+
// a cached object
private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java?rev=1201511&r1=1201510&r2=1201511&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java Sun Nov 13 20:59:07 2011
@@ -71,6 +71,13 @@ import java.util.regex.Pattern;
* @since 1.6
*/
public class Lang {
+ // implementation note: This class is divided into two sections. The first part is a static factory interface that
+ // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
+ // encapsulate a particular language-guessing rule table and the language guessing itself.
+ //
+ // It may make sense in the future to expose the private constructor to allow power users to build custom language-
+ // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
+ // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
private static final class LangRule {
private final boolean acceptOnMatch;
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java?rev=1201511&r1=1201510&r2=1201511&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java Sun Nov 13 20:59:07 2011
@@ -53,6 +53,9 @@ import java.util.Set;
* @since 1.6
*/
public class Languages {
+ // implementation note: This class is divided into two sections. The first part is a static factory interface that
+ // exposes org/apache/commons/codec/language/bm/%s_languages.txt for %s in NameType.* as a list of supported
+ // languages, and a second part that provides instance methods for accessing this set of supported languages.
/**
* A set of languages.
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java?rev=1201511&r1=1201510&r2=1201511&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java Sun Nov 13 20:59:07 2011
@@ -18,7 +18,9 @@
package org.apache.commons.codec.language.bm;
/**
- * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}.
+ * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}. The
+ * <code>GENERIC</code> NameType should work reasonably well for non-name words. The other encodings are specifically
+ * tuned to family names, and may not work well at all for general text.
*
* @author Apache Software Foundation
* @since 1.6
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1201511&r1=1201510&r2=1201511&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Sun Nov 13 20:59:07 2011
@@ -51,8 +51,23 @@ import java.util.TreeSet;
*/
public class PhoneticEngine {
+ /**
+ * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside this package,
+ * and probably not outside the {@link PhoneticEngine} class.
+ *
+ * @author Apache Software Foundation
+ * @since 1.6
+ */
static final class PhonemeBuilder {
+ /**
+ * An empty builder where all phonemes must come from some set of languages. This will contain a single
+ * phoneme of zero characters. This can then be appended to. This should be the only way to create a new
+ * phoneme from scratch.
+ *
+ * @param languages the set of languages
+ * @return a new, empty phoneme builder
+ */
public static PhonemeBuilder empty(Languages.LanguageSet languages) {
return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages)));
}
@@ -63,6 +78,12 @@ public class PhoneticEngine {
this.phonemes = phonemes;
}
+ /**
+ * Create a new phoneme builder containing all phonemes in this one extended by <code>str</code>.
+ *
+ * @param str the characters to append to the phonemes
+ * @return a new phoneme builder lengthened by <code>str</code>
+ */
public PhonemeBuilder append(CharSequence str) {
Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
@@ -73,6 +94,16 @@ public class PhoneticEngine {
return new PhonemeBuilder(newPhonemes);
}
+ /**
+ * Create a new phoneme builder containing the application of the expression to all phonemes in this builder.
+ *
+ * This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
+ * incompatible.
+ *
+ * @param phonemeExpr the expression to apply
+ * @return a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
+ * in turn
+ */
public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
@@ -88,10 +119,22 @@ public class PhoneticEngine {
return new PhonemeBuilder(newPhonemes);
}
+ /**
+ * The underlying phoneme set. Please don't mutate.
+ *
+ * @return the phoneme set
+ */
public Set<Rule.Phoneme> getPhonemes() {
return this.phonemes;
}
+ /**
+ * Stringify the phoneme set. This produces a single string of the strings of each phoneme, joined with a pipe.
+ * This is explicitly provided in place of toString as it is a potentially expensive operation, which should be
+ * avoided when debugging.
+ *
+ * @return the stringified phoneme set
+ */
public String makeString() {
StringBuilder sb = new StringBuilder();
@@ -108,6 +151,17 @@ public class PhoneticEngine {
}
}
+ /**
+ * A function closure capturing the application of a list of rules to an input sequence at a particular offset.
+ * After invocation, the values <code>i</code> and <code>found</code> are updated. <code>i</code> points to the
+ * index of the next char in <code>input</code> that must be processed next (the input up to that index having been
+ * processed already), and <code>found</code> indicates if a matching rule was found or not. In the case where a
+ * matching rule was found, <code>phonemeBuilder</code> is replaced with a new builder containing the phonemes
+ * updated by the matching rule.
+ *
+ * @author Apache Software Foundation
+ * @since 1.6
+ */
private static final class RulesApplication {
private final List<Rule> finalRules;
private final CharSequence input;
@@ -134,6 +188,13 @@ public class PhoneticEngine {
return this.phonemeBuilder;
}
+ /**
+ * This invokes the rules. It loops over the rules list, stopping at the first one that has a matching context
+ * and pattern. It then applies this rule to the phoneme builder to produce updated phonemes. If there was no
+ * match, <code>i</code> is advanced one and the character is silently dropped from the phonetic spelling.
+ *
+ * @return <code>this</code>
+ */
public RulesApplication invoke() {
this.found = false;
int patternLength = 0;
@@ -176,6 +237,12 @@ public class PhoneticEngine {
"de la", "della", "des", "di", "do", "dos", "du", "van", "von"))));
}
+ /**
+ * This is a performance hack to avoid overhead associated with very frequent CharSequence.subSequence calls.
+ *
+ * @param cached the character sequence to cache
+ * @return a <code>CharSequence</code> that internally memoises subSequence values
+ */
private static CharSequence cacheSubSequence(final CharSequence cached) {
// return cached;
final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()];
@@ -203,6 +270,12 @@ public class PhoneticEngine {
};
}
+ /**
+ * Join some strings with an internal separator.
+ * @param strings Strings to join
+ * @param sep String to separate them with
+ * @return a single String consisting of each element of <code>strings</code> interleaved by <code>sep</code>
+ */
private static String join(Iterable<String> strings, String sep) {
StringBuilder sb = new StringBuilder();
Iterator<String> si = strings.iterator();
@@ -244,6 +317,14 @@ public class PhoneticEngine {
this.lang = Lang.instance(nameType);
}
+ /**
+ * Apply the final rules to convert from a language-specific phonetic representation to a language-independent
+ * representation.
+ *
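+ * @param phonemeBuilder the phoneme builder holding the language-specific phonetic representation
+ * @param finalRules the final rules to apply; must not be <code>null</code>
+ * @return a phoneme builder holding the result of applying <code>finalRules</code>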
+ */
private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule> finalRules) {
if (finalRules == null) {
throw new NullPointerException("finalRules can not be null");
@@ -304,8 +385,11 @@ public class PhoneticEngine {
*/
public String encode(String input, final Languages.LanguageSet languageSet) {
final List<Rule> rules = Rule.getInstance(this.nameType, RuleType.RULES, languageSet);
+ // rules common across many (all) languages
final List<Rule> finalRules1 = Rule.getInstance(this.nameType, this.ruleType, "common");
+ // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
final List<Rule> finalRules2 = Rule.getInstance(this.nameType, this.ruleType, languageSet);
+
// System.err.println("Languages: " + languageSet);
// System.err.println("Rules: " + rules);
@@ -333,6 +417,7 @@ public class PhoneticEngine {
final List<String> words = Arrays.asList(input.split("\\s+"));
final List<String> words2 = new ArrayList<String>();
+ // special-case handling of word prefixes based upon the name type
switch (this.nameType) {
case SEPHARDIC:
for (String aWord : words) {
@@ -380,13 +465,10 @@ public class PhoneticEngine {
// System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
}
- // System.err.println("Applying general rules");
+ // Apply the general rules
phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1);
- // System.err.println("Now got: " + phonemeBuilder.makeString());
- // System.err.println("Applying language-specific rules");
+ // Apply the language-specific rules
phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2);
- // System.err.println("Now got: " + phonemeBuilder.makeString());
- // System.err.println("Done");
return phonemeBuilder.makeString();
}
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java?rev=1201511&r1=1201510&r2=1201511&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java Sun Nov 13 20:59:07 2011
@@ -583,7 +583,9 @@ public class Rule {
}
/**
- * Decides if the pattern and context match the input starting at a position.
+ * Decides if the pattern and context match the input starting at a position. It is a match if the
+ * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
+ * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
*
* @param input
* the input String
@@ -604,6 +606,9 @@ public class Rule {
return false;
}
+ // fixme: this is a readability/speed trade-off - these 3 expressions should be inlined for speed to avoid
+ // evaluating later ones if earlier ones have already failed, but that would make the code a lot harder to
+ // read
boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern);
boolean rContextMatches = this.rContext.isMatch(input.subSequence(ipl, input.length()));
boolean lContextMatches = this.lContext.isMatch(input.subSequence(0, i));
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/RuleType.java?rev=1201511&r1=1201510&r2=1201511&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/RuleType.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/RuleType.java Sun Nov 13 20:59:07 2011
@@ -25,7 +25,12 @@ package org.apache.commons.codec.languag
*/
public enum RuleType {
- APPROX("approx"), EXACT("exact"), RULES("rules");
+ /** Approximate rules, which will lead to the largest number of phonetic interpretations. */
+ APPROX("approx"),
+ /** Exact rules, which will lead to a minimum number of phonetic interpretations. */
+ EXACT("exact"),
+ /** For internal use only. Please use {@link #APPROX} or {@link #EXACT}. */
+ RULES("rules");
private final String name;