You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2012/08/23 22:01:31 UTC

svn commit: r1376668 - in /commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm: BeiderMorseEncoder.java Lang.java Languages.java NameType.java PhoneticEngine.java Rule.java

Author: tn
Date: Thu Aug 23 20:01:30 2012
New Revision: 1376668

URL: http://svn.apache.org/viewvc?rev=1376668&view=rev
Log:
Javadoc fixes for Beider-Morse Encoder / Phonetic engine, removed debug prints.

Modified:
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1376668&r1=1376667&r2=1376668&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Thu Aug 23 20:01:30 2012
@@ -21,50 +21,41 @@ import org.apache.commons.codec.EncoderE
 import org.apache.commons.codec.StringEncoder;
 
 /**
- * <p>
  * Encodes strings into their Beider-Morse phonetic encoding.
- * </p>
  * <p>
  * Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range of words.
- * </p>
  * <p>
- * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it is mutable, and may not be
- * thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly.
- * </p>
- *
- * <h2>Encoding overview</h2>
- *
+ * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it
+ * is mutable, and may not be thread-safe. If you require a guaranteed thread-safe encoding then use
+ * {@link PhoneticEngine} directly.
  * <p>
- * Beider-Morse phonetic encodings is a multi-step process. Firstly, a table of rules is consulted to guess what
- * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French. Next,
- * the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of letters
- * can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
- * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly,
- * this language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking
- * into account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
- * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
- * pronounced in several ways in the source language have only one way to represent them in this average phonetic
- * language, so the result is again a set of phonetic spellings.
- * </p>
- *
+ * <b>Encoding overview</b>
  * <p>
- * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
- * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
- * Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English) names. As sometimes it is
- * ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once with the prefix
- * and once without it. The resulting encoding contains one and then the other result.
- * </p>
- *
- *
- * <h2>Encoding format</h2>
- *
- * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
- * are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character. If multiple hyphenated
- * words where found, or if the word may contain a name prefix, each encoded word is placed in elipses and these blocks
- * are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible prefix. The form without prefix encodes to
- * "<code>ortlaj|ortlej</code>", while the form with prefix encodes to "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is
- * "<code>(ortlaj|ortlej)-(dortlaj|dortlej)</code>".
- *
+ * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
+ * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French.
+ * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some
+ * runs of letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up
+ * into phonemes at different places, so this stage results in a set of possible language-specific phonetic
+ * representations. Lastly, this language-specific phonetic representation is processed by a table of rules that
+ * re-writes it phonetically taking into account systematic pronunciation differences between languages, to move
+ * it towards a pan-indo-european phonetic representation. Again, sometimes there are multiple ways this could be
+ * done and sometimes things that can be pronounced in several ways in the source language have only one way to
+ * represent them in this average phonetic language, so the result is again a set of phonetic spellings.
+ * <p>
+ * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated.
+ * In this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final
+ * encoding. Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English)
+ * names. As sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word
+ * is encoded once with the prefix and once without it. The resulting encoding contains one and then the other result.
+ * <p>
+ * <b>Encoding format</b>
+ * <p>
+ * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where
+ * there are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character.
+ * If multiple hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed
+ * in parentheses and these blocks are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible
+ * prefix. The form without prefix encodes to "<code>ortlaj|ortlej</code>", while the form with prefix encodes to
+ * "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is "<code>(ortlaj|ortlej)-(dortlaj|dortlej)</code>".
  * <p>
  * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
  * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to
@@ -72,7 +63,6 @@ import org.apache.commons.codec.StringEn
  * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
  * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
  * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
- * </p>
  *
  * @since 1.6
  */
@@ -140,8 +130,8 @@ public class BeiderMorseEncoder implemen
     }
 
     /**
-     * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phoentic encodings optimized for Ashkenazi or
-     * Sephardic Jewish family names.
+     * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings
+     * optimized for Ashkenazi or Sephardic Jewish family names.
      *
      * @param nameType
      *            the NameType in use

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java?rev=1376668&r1=1376667&r2=1376668&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Lang.java Thu Aug 23 20:01:30 2012
@@ -31,40 +31,38 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 /**
- * <p>
  * Language guessing utility.
- * </p>
  * <p>
- * This class encapsulates rules used to guess the possible languages that a word originates from. This is done by reference to a whole
- * series of rules distributed in resource files.
- * </p>
- * <p>
- * Instances of this class are typically managed through the static factory method instance(). Unless you are developing your own language
- * guessing rules, you will not need to interact with this class directly.
- * </p>
+ * This class encapsulates rules used to guess the possible languages that a word originates from. This is
+ * done by reference to a whole series of rules distributed in resource files.
+ * <p>
+ * Instances of this class are typically managed through the static factory method instance().
+ * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
  * <p>
  * This class is intended to be immutable and thread-safe.
- * </p>
- * <h2>Lang resources</h2
  * <p>
- * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named
- * following the pattern: <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote> The format of these resources is the
- * following:
- * </p>
+ * <b>Lang resources</b>
+ * <p>
+ * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
+ * They are systematically named following the pattern:
+ * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
+ * The format of these resources is the following:
  * <ul>
- * <li><b>Rules:</b> whitespace separated strings. There should be 3 columns to each row, and these will be interpreted as:
+ * <li><b>Rules:</b> whitespace separated strings.
+ * There should be 3 columns to each row, and these will be interpreted as:
  * <ol>
  * <li>pattern: a regular expression.</li>
  * <li>languages: a '+'-separated list of languages.</li>
  * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
  * </ol>
  * </li>
- * <li><b>End-of-line comments:</b> Any occurance of '//' will cause all text following on that line to be discarded as a comment.</li>
- * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a
- * line ending in '*' and '/' is found.</li>
+ * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
+ * discarded as a comment.</li>
+ * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
+ * This will skip all content until a line ending in '*' and '/' is found.</li>
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  * </ul>
- * <p/>
+ * <p>
  * Port of lang.php
  *
  * @since 1.6
@@ -116,13 +114,10 @@ public class Lang {
     }
 
     /**
-     * <p>
      * Loads language rules from a resource.
-     * </p>
      * <p>
-     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method. You will only need to call this
-     * yourself if you are developing custom language mapping rules.
-     * </p>
+     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
+     * You will only need to call this yourself if you are developing custom language mapping rules.
      *
      * @param languageRulesResourceName
      *            the fully-qualified resource name to load
@@ -157,7 +152,6 @@ public class Lang {
                     // discard comments
                     int cmtI = line.indexOf(ResourceConstants.CMT);
                     if (cmtI >= 0) {
-                        // System.err.println("index of comment: " + cmtI);
                         line = line.substring(0, cmtI);
                     }
 
@@ -170,11 +164,10 @@ public class Lang {
 
                     // split it up
                     String[] parts = line.split("\\s+");
-                    // System.err.println("part count: " + parts.length);
 
                     if (parts.length != 3) {
-                        // fixme: we really need to log this somewhere
-                        System.err.println("Warning: malformed line '" + rawLine + "'");
+                        // FIXME: consider throwing an IllegalStateException like in Rule
+                        // System.err.println("Warning: malformed line '" + rawLine + "'");
                         continue;
                     }
 
@@ -219,22 +212,15 @@ public class Lang {
      */
     public Languages.LanguageSet guessLanguages(String input) {
         String text = input.toLowerCase(Locale.ENGLISH);
-        // System.out.println("Testing text: '" + text + "'");
 
         Set<String> langs = new HashSet<String>(this.languages.getLanguages());
         for (LangRule rule : this.rules) {
             if (rule.matches(text)) {
-                // System.out.println("Rule " + rule.pattern + " matches " + text);
                 if (rule.acceptOnMatch) {
-                    // System.out.println("Retaining " + rule.languages);
                     langs.retainAll(rule.languages);
                 } else {
-                    // System.out.println("Removing " + rule.languages);
                     langs.removeAll(rule.languages);
                 }
-                // System.out.println("Current languages: " + langs);
-            } else {
-                // System.out.println("Rule " + rule.pattern + " does not match " + text);
             }
         }
 

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java?rev=1376668&r1=1376667&r2=1376668&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Languages.java Thu Aug 23 20:01:30 2012
@@ -27,17 +27,13 @@ import java.util.Scanner;
 import java.util.Set;
 
 /**
- * <p>
  * Language codes.
- * </p>
  * <p>
- * Language codes are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named following the
- * pattern:
- * </p>
+ * Language codes are typically loaded from resource files. These are UTF-8 encoded text files. They are
+ * systematically named following the pattern:
  * <blockquote>org/apache/commons/codec/language/bm/${{@link NameType#getName()} languages.txt</blockquote>
  * <p>
  * The format of these resources is the following:
- * </p>
  * <ul>
  * <li><b>Language:</b> a single string containing no whitespace</li>
  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</li>
@@ -47,7 +43,6 @@ import java.util.Set;
  * </ul>
  * <p>
  * Ported from language.php
- * </p>
  *
  * @since 1.6
  *

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java?rev=1376668&r1=1376667&r2=1376668&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/NameType.java Thu Aug 23 20:01:30 2012
@@ -19,8 +19,8 @@ package org.apache.commons.codec.languag
 
 /**
  * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}. The
- * <code>GENERIC</code> NameType should work reasonably well for non-name words. The other encodings are specifically
- * tuned to family names, and may not work well at all for general text.
+ * <code>GENERIC</code> NameType should work reasonably well for non-name words. The other encodings are
+ * specifically tuned to family names, and may not work well at all for general text.
  *
  * @since 1.6
  */

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1376668&r1=1376667&r2=1376668&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Thu Aug 23 20:01:30 2012
@@ -31,29 +31,25 @@ import java.util.Set;
 import java.util.TreeSet;
 
 /**
- * <p>
  * Converts words into potential phonetic representations.
- * </p>
  * <p>
- * This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes into account the likely source
- * language. Next, this phonetic representation is converted into a pan-european 'average' representation, allowing comparison between
- * different versions of essentially the same word from different languages.
- * </p>
+ * This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
+ * into account the likely source language. Next, this phonetic representation is converted into a
+ * pan-european 'average' representation, allowing comparison between different versions of essentially
+ * the same word from different languages.
  * <p>
- * This class is intentionally immutable. If you wish to alter the settings for a PhoneticEngine, you must make a new one with the updated
- * settings. This makes the class thread-safe.
- * </p>
+ * This class is intentionally immutable. If you wish to alter the settings for a PhoneticEngine, you
+ * must make a new one with the updated settings. This makes the class thread-safe.
  * <p>
  * Ported from phoneticengine.php
- * </p>
  *
  * @since 1.6
  */
 public class PhoneticEngine {
 
     /**
-     * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside this package,
-     * and probably not outside the {@link PhoneticEngine} class.
+     * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
+     * this package, and probably not outside the {@link PhoneticEngine} class.
      *
      * @since 1.6
      */
@@ -95,7 +91,7 @@ public class PhoneticEngine {
 
         /**
          * Creates a new phoneme builder containing the application of the expression to all phonemes in this builder.
-         *
+         * <p>
          * This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
          * incompatible.
          *
@@ -133,9 +129,9 @@ public class PhoneticEngine {
         }
 
         /**
-         * Stringifies the phoneme set. This produces a single string of the strings of each phoneme, joined with a pipe.
-         * This is explicitly provied in place of toString as it is a potentially expensive operation, which should be
-         * avoided when debugging.
+         * Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
+         * joined with a pipe. This is explicitly provided in place of toString as it is a potentially
+         * expensive operation, which should be avoided when debugging.
          *
          * @return  the stringified phoneme set
          */
@@ -160,7 +156,7 @@ public class PhoneticEngine {
      * After invocation, the values <code>i</code> and <code>found</code> are updated. <code>i</code> points to the
      * index of the next char in <code>input</code> that must be processed next (the input up to that index having been
      * processed already), and <code>found</code> indicates if a matching rule was found or not. In the case where a
-     * matching rule was found, <code>phonemeBuilder</code> is replaced with a new buidler containing the phonemes
+     * matching rule was found, <code>phonemeBuilder</code> is replaced with a new builder containing the phonemes
      * updated by the matching rule.
      *
      * Although this class is not thread-safe (it has mutable unprotected fields), it is not shared between threads
@@ -206,19 +202,17 @@ public class PhoneticEngine {
         public RulesApplication invoke() {
             this.found = false;
             int patternLength = 0;
-            RULES: for (Rule rule : this.finalRules) {
+            for (Rule rule : this.finalRules) {
                 String pattern = rule.getPattern();
                 patternLength = pattern.length();
-                // log("trying pattern: " + pattern);
 
                 if (!rule.patternAndContextMatches(this.input, this.i)) {
-                    // log("no match");
-                    continue RULES;
+                    continue;
                 }
 
                 this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes);
                 this.found = true;
-                break RULES;
+                break;
             }
 
             if (!this.found) {
@@ -249,7 +243,7 @@ public class PhoneticEngine {
      * This is a performance hack to avoid overhead associated with very frequent CharSequence.subSequence calls.
      *
      * @param cached the character sequence to cache
-     * @return a <code>CharSequence</code> that internally memoises subSequence values
+     * @return a <code>CharSequence</code> that internally caches subSequence values
      */
     private static CharSequence cacheSubSequence(final CharSequence cached) {
         // return cached;
@@ -285,7 +279,7 @@ public class PhoneticEngine {
      * Joins some strings with an internal separator.
      * @param strings   Strings to join
      * @param sep       String to separate them with
-     * @return          a single String consisting of each element of <code>strings</code> interlieved by <code>sep</code>
+     * @return          a single String consisting of each element of <code>strings</code> interleaved by <code>sep</code>
      */
     private static String join(Iterable<String> strings, String sep) {
         StringBuilder sb = new StringBuilder();
@@ -351,8 +345,8 @@ public class PhoneticEngine {
     }
 
     /**
-     * Applies the final rules to convert from a language-specific phonetic representation to a language-independent
-     * representation.
+     * Applies the final rules to convert from a language-specific phonetic representation to a
+     * language-independent representation.
      *
      * @param phonemeBuilder
      * @param finalRules
@@ -371,7 +365,6 @@ public class PhoneticEngine {
         for (Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) {
             PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages());
             CharSequence phonemeText = cacheSubSequence(phoneme.getPhonemeText());
-            // System.err.println("Expanding: " + phonemeText);
 
             for (int i = 0; i < phonemeText.length();) {
                 RulesApplication rulesApplication =
@@ -380,17 +373,13 @@ public class PhoneticEngine {
                 subBuilder = rulesApplication.getPhonemeBuilder();
 
                 if (!found) {
-                    // System.err.println("Not found. Appending as-is");
+                    // not found, appending as-is
                     subBuilder = subBuilder.append(phonemeText.subSequence(i, i + 1));
                 }
 
                 i = rulesApplication.getI();
-
-                // System.err.println(phonemeText + " " + i + ": " + subBuilder.makeString());
             }
 
-            // System.err.println("Expanded to: " + subBuilder.makeString());
-            // System.err.println("phenomes in collection of type: " + subBuilder.getPhonemes().getClass());
             phonemes.addAll(subBuilder.getPhonemes());
         }
 
@@ -424,9 +413,6 @@ public class PhoneticEngine {
         // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
         final List<Rule> finalRules2 = Rule.getInstance(this.nameType, this.ruleType, languageSet);
 
-        // System.err.println("Languages: " + languageSet);
-        // System.err.println("Rules: " + rules);
-
         // tidy the input
         // lower case is a locale-dependent operation
         input = input.toLowerCase(Locale.ENGLISH).replace('-', ' ').trim();
@@ -497,7 +483,6 @@ public class PhoneticEngine {
                     new RulesApplication(rules, inputCache, phonemeBuilder, i, maxPhonemes).invoke();
             i = rulesApplication.getI();
             phonemeBuilder = rulesApplication.getPhonemeBuilder();
-            // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
         }
 
         // Apply the general rules
@@ -538,7 +523,7 @@ public class PhoneticEngine {
     /**
      * Gets if multiple phonetic encodings are concatenated or if just the first one is kept.
      *
-     * @return true if multiple phonetic encodings are returned, false if just the first is.
+     * @return true if multiple phonetic encodings are returned, false if just the first is
      */
     public boolean isConcat() {
         return this.concat;

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java?rev=1376668&r1=1376667&r2=1376668&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/Rule.java Thu Aug 23 20:01:30 2012
@@ -33,12 +33,10 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
- * <p>
  * A phoneme rule.
- * </p>
  * <p>
- * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply and a logical flag indicating if
- * all lanugages must be in play. A rule matches if:
+ * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
+ * and a logical flag indicating if all languages must be in play. A rule matches if:
  * <ul>
  * <li>the pattern matches at the current position</li>
  * <li>the string up until the beginning of the pattern matches the left context</li>
@@ -46,22 +44,22 @@ import java.util.regex.Pattern;
  * <li>logical is ALL and all languages are in scope; or</li>
  * <li>logical is any other value and at least one language is in scope</li>
  * </ul>
- * </p>
  * <p>
- * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user to explicitly construct their
- * own.
- * </p>
+ * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
+ * to explicitly construct their own.
  * <p>
  * Rules are immutable and thread-safe.
- * <h2>Rules resources</h2>
  * <p>
- * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named following the pattern:
+ * <b>Rules resources</b>
+ * <p>
+ * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
+ * named following the pattern:
  * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
- * </p>
  * <p>
  * The format of these resources is the following:
  * <ul>
- * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these will be interpreted as:
+ * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
+ * will be interpreted as:
  * <ol>
  * <li>pattern</li>
  * <li>left context</li>
@@ -69,12 +67,12 @@ import java.util.regex.Pattern;
  * <li>phoneme</li>
  * </ol>
  * </li>
- * <li><b>End-of-line comments:</b> Any occurance of '//' will cause all text following on that line to be discarded as a comment.</li>
- * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a
- * line ending in '*' and '/' is found.</li>
+ * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
+ * as a comment.</li>
+ * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
+ * all content until a line ending in '*' and '/' is found.</li>
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  * </ul>
- * </p>
  *
  * @since 1.6
  */
@@ -355,7 +353,8 @@ public class Rule {
                         // include statement
                         String incl = line.substring(HASH_INCLUDE.length()).trim();
                         if (incl.contains(" ")) {
-                            System.err.println("Warining: malformed import statement: " + rawLine);
+                            // FIXME: consider throwing an IllegalStateException like in parsePhonemeExpr
+                            // System.err.println("Warning: malformed import statement: " + rawLine);
                         } else {
                             lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
                         }
@@ -363,7 +362,8 @@ public class Rule {
                         // rule
                         String[] parts = line.split("\\s+");
                         if (parts.length != 4) {
-                            System.err.println("Warning: malformed rule statement split into " + parts.length + " parts: " + rawLine);
+                            // FIXME: consider throwing an IllegalStateException like in parsePhonemeExpr
+                            // System.err.println("Warning: malformed rule statement split into " + parts.length + " parts: " + rawLine);
                         } else {
                             try {
                                 String pat = stripQuotes(parts[0]);
@@ -494,7 +494,6 @@ public class Rule {
             }
         }
 
-        // System.out.println("Couldn't optimize regex: " + regex);
         return new RPattern() {
             Pattern pattern = Pattern.compile(regex);