You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2012/03/07 22:02:59 UTC

svn commit: r1298118 - in /commons/proper/codec/trunk/src: changes/ main/java/org/apache/commons/codec/language/bm/ test/java/org/apache/commons/codec/language/bm/

Author: ggregory
Date: Wed Mar  7 21:02:59 2012
New Revision: 1298118

URL: http://svn.apache.org/viewvc?rev=1298118&view=rev
Log:
[CODEC-132] BeiderMorseEncoder OOM issues

Modified:
    commons/proper/codec/trunk/src/changes/changes.xml
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
    commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
    commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java

Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Wed Mar  7 21:02:59 2012
@@ -26,6 +26,9 @@
       org.apache.commons.codec.net.URLCodec charset field final. </action>   </release>
     -->
     <release version="1.6.1" date="TBD" description="Feature and fix release.">
+      <action dev="ggregory" type="fix" issue="CODEC-132" due-to="rcmuir">
+        BeiderMorseEncoder OOM issues
+      </action>
       <action dev="ggregory" type="fix" issue="CODEC-121" due-to="javajohn">
         QuotedPrintableCodec does not support soft line break per the 'quoted-printable' example on Wikipedia
       </action>

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Wed Mar  7 21:02:59 2012
@@ -100,7 +100,7 @@ public class BeiderMorseEncoder implemen
 
     /**
      * Gets the name type currently in operation.
-     * 
+     *
      * @return the NameType currently being used
      */
     public NameType getNameType() {
@@ -109,7 +109,7 @@ public class BeiderMorseEncoder implemen
 
     /**
      * Gets the rule type currently in operation.
-     * 
+     *
      * @return the RuleType currently being used
      */
     public RuleType getRuleType() {
@@ -118,7 +118,7 @@ public class BeiderMorseEncoder implemen
 
     /**
      * Discovers if multiple possible encodings are concatenated.
-     * 
+     *
      * @return true if multiple encodings are concatenated, false if just the first one is returned
      */
     public boolean isConcat() {
@@ -127,33 +127,55 @@ public class BeiderMorseEncoder implemen
 
     /**
      * Sets how multiple possible phonetic encodings are combined.
-     * 
+     *
      * @param concat
      *            true if multiple encodings are to be combined with a '|', false if just the first one is to be considered
      */
     public void setConcat(boolean concat) {
-        this.engine = new PhoneticEngine(this.engine.getNameType(), this.engine.getRuleType(), concat);
+        this.engine = new PhoneticEngine(this.engine.getNameType(),
+                                         this.engine.getRuleType(),
+                                         concat,
+                                         this.engine.getMaxPhonemes());
     }
 
     /**
      * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings optimized for Ashkenazi or
      * Sephardic Jewish family names.
-     * 
+     *
      * @param nameType
      *            the NameType in use
      */
     public void setNameType(NameType nameType) {
-        this.engine = new PhoneticEngine(nameType, this.engine.getRuleType(), this.engine.isConcat());
+        this.engine = new PhoneticEngine(nameType,
+                                         this.engine.getRuleType(),
+                                         this.engine.isConcat(),
+                                         this.engine.getMaxPhonemes());
     }
 
     /**
      * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
-     * 
+     *
      * @param ruleType
      *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
      */
     public void setRuleType(RuleType ruleType) {
-        this.engine = new PhoneticEngine(this.engine.getNameType(), ruleType, this.engine.isConcat());
+        this.engine = new PhoneticEngine(this.engine.getNameType(),
+                                         ruleType,
+                                         this.engine.isConcat(),
+                                         this.engine.getMaxPhonemes());
+    }
+
+    /**
+     * Sets the maximum number of phonemes that shall be considered by the engine.
+     *
+     * @param maxPhonemes
+     *            the maximum number of phonemes returned by the engine
+     */
+    public void setMaxPhonemes(int maxPhonemes) {
+        this.engine = new PhoneticEngine(this.engine.getNameType(),
+                                         this.engine.getRuleType(),
+                                         this.engine.isConcat(),
+                                         maxPhonemes);
     }
 
 }

Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Wed Mar  7 21:02:59 2012
@@ -101,17 +101,22 @@ public class PhoneticEngine {
          * incompatible.
          *
          * @param phonemeExpr   the expression to apply
+         * @param maxPhonemes   the maximum number of phonemes to build up
          * @return  a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
          *      in turn
          */
-        public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
+        public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr, int maxPhonemes) {
             Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
 
-            for (Rule.Phoneme left : this.phonemes) {
+            EXPR: for (Rule.Phoneme left : this.phonemes) {
                 for (Rule.Phoneme right : phonemeExpr.getPhonemes()) {
                     Rule.Phoneme join = left.join(right);
                     if (!join.getLanguages().isEmpty()) {
-                        newPhonemes.add(join);
+                        if (newPhonemes.size() < maxPhonemes) {
+                            newPhonemes.add(join);
+                        } else {
+                            break EXPR;
+                        }
                     }
                 }
             }
@@ -168,9 +173,11 @@ public class PhoneticEngine {
 
         private PhonemeBuilder phonemeBuilder;
         private int i;
+        private int maxPhonemes;
         private boolean found;
 
-        public RulesApplication(List<Rule> finalRules, CharSequence input, PhonemeBuilder phonemeBuilder, int i) {
+        public RulesApplication(List<Rule> finalRules, CharSequence input,
+                                PhonemeBuilder phonemeBuilder, int i, int maxPhonemes) {
             if (finalRules == null) {
                 throw new NullPointerException("The finalRules argument must not be null");
             }
@@ -178,6 +185,7 @@ public class PhoneticEngine {
             this.phonemeBuilder = phonemeBuilder;
             this.input = input;
             this.i = i;
+            this.maxPhonemes = maxPhonemes;
         }
 
         public int getI() {
@@ -208,7 +216,7 @@ public class PhoneticEngine {
                     continue RULES;
                 }
 
-                this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme());
+                this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes);
                 this.found = true;
                 break RULES;
             }
@@ -289,6 +297,8 @@ public class PhoneticEngine {
         return sb.toString();
     }
 
+    private static final int DEFAULT_MAX_PHONEMES = 20;
+
     private final Lang lang;
 
     private final NameType nameType;
@@ -297,9 +307,11 @@ public class PhoneticEngine {
 
     private final boolean concat;
 
+    private final int maxPhonemes;
+
     /**
      * Generates a new, fully-configured phonetic engine.
-     * 
+     *
      * @param nameType
      *            the type of names it will use
      * @param ruleType
@@ -308,6 +320,22 @@ public class PhoneticEngine {
      *            if it will concatenate multiple encodings
      */
     public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat) {
+        this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES);
+    }
+
+    /**
+     * Generates a new, fully-configured phonetic engine.
+     *
+     * @param nameType
+     *            the type of names it will use
+     * @param ruleType
+     *            the type of rules it will apply
+     * @param concat
+     *            if it will concatenate multiple encodings
+     * @param maxPhonemes
+     *            the maximum number of phonemes that will be handled
+     */
+    public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat, int maxPhonemes) {
         if (ruleType == RuleType.RULES) {
             throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES);
         }
@@ -315,6 +343,7 @@ public class PhoneticEngine {
         this.ruleType = ruleType;
         this.concat = concat;
         this.lang = Lang.instance(nameType);
+        this.maxPhonemes = maxPhonemes;
     }
 
     /**
@@ -341,7 +370,8 @@ public class PhoneticEngine {
             // System.err.println("Expanding: " + phonemeText);
 
             for (int i = 0; i < phonemeText.length();) {
-                RulesApplication rulesApplication = new RulesApplication(finalRules, phonemeText, subBuilder, i).invoke();
+                RulesApplication rulesApplication =
+                        new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke();
                 boolean found = rulesApplication.isFound();
                 subBuilder = rulesApplication.getPhonemeBuilder();
 
@@ -459,7 +489,8 @@ public class PhoneticEngine {
         // loop over each char in the input - we will handle the increment manually
         CharSequence inputCache = cacheSubSequence(input);
         for (int i = 0; i < inputCache.length();) {
-            RulesApplication rulesApplication = new RulesApplication(rules, inputCache, phonemeBuilder, i).invoke();
+            RulesApplication rulesApplication =
+                    new RulesApplication(rules, inputCache, phonemeBuilder, i, maxPhonemes).invoke();
             i = rulesApplication.getI();
             phonemeBuilder = rulesApplication.getPhonemeBuilder();
             // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
@@ -508,4 +539,13 @@ public class PhoneticEngine {
     public boolean isConcat() {
         return this.concat;
     }
+
+    /**
+     * Gets the maximum number of phonemes the engine will calculate for a given input.
+     *
+     * @return the maximum number of phonemes
+     */
+    public int getMaxPhonemes() {
+        return this.maxPhonemes;
+    }
 }

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java Wed Mar  7 21:02:59 2012
@@ -19,6 +19,7 @@ package org.apache.commons.codec.languag
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 import org.apache.commons.codec.EncoderException;
 import org.apache.commons.codec.StringEncoder;
@@ -60,7 +61,7 @@ public class BeiderMorseEncoderTest exte
     public void testAllChars() throws EncoderException {
         BeiderMorseEncoder bmpm = createGenericApproxEncoder();
         for (char c = Character.MIN_VALUE; c < Character.MAX_VALUE; c++) {
-            bmpm.encode("" + c);
+            bmpm.encode(Character.toString(c));
         }
     }
 
@@ -68,7 +69,7 @@ public class BeiderMorseEncoderTest exte
     public void testAsciiEncodeNotEmpty1Letter() throws EncoderException {
         BeiderMorseEncoder bmpm = createGenericApproxEncoder();
         for (char c = 'a'; c <= 'z'; c++) {
-            final String value = "" + c;
+            final String value = Character.toString(c);
             final String valueU = value.toUpperCase();
             assertNotEmpty(bmpm, value);
             assertNotEmpty(bmpm, valueU);
@@ -138,6 +139,24 @@ public class BeiderMorseEncoderTest exte
     }
 
     @Test
+    public void testOOM() throws EncoderException {
+        String phrase = "200697900'-->&#1913348150;</  bceaeef >aadaabcf\"aedfbff<!--\'-->?>cae"
+                + "cfaaa><?&#<!--</script>&lang&fc;aadeaf?>>&bdquo<    cc =\"abff\"    /></   afe  >"
+                + "<script><!-- f(';<    cf aefbeef = \"bfabadcf\" ebbfeedd = fccabeb >";
+
+        BeiderMorseEncoder encoder = new BeiderMorseEncoder();
+        encoder.setNameType(NameType.GENERIC);
+        encoder.setRuleType(RuleType.EXACT);
+        encoder.setMaxPhonemes(10);
+
+        String phonemes = encoder.encode(phrase);
+        assertTrue(phonemes.length() > 0);
+
+        String[] phonemeArr = phonemes.split("\\|");
+        assertTrue(phonemeArr.length <= 10);
+    }
+
+    @Test
     public void testSetConcat() {
         BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
         bmpm.setConcat(false);

Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java Wed Mar  7 21:02:59 2012
@@ -18,6 +18,7 @@
 package org.apache.commons.codec.language.bm;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.util.Arrays;
 import java.util.List;
@@ -38,22 +39,18 @@ public class PhoneticEngineTest {
     @Parameterized.Parameters
     public static List<Object[]> data() {
         return Arrays
-                .asList(new Object[] {
-                        "Renault",
-                        "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult",
-                        NameType.GENERIC,
-                        RuleType.APPROX,
-                        true },
-                        new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true },
-                        new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true },
-                        new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true },
-                        new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true },
+                .asList(new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX, true, 10 },
+                        new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true, 10 },
+                        new Object[] { "Renault", "rYnDlt", NameType.ASHKENAZI, RuleType.APPROX, true, 1 },
+                        new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true, 10 },
+                        new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true, 10 },
+                        new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true, 10 },
                         new Object[] {
                                 "van helsing",
                                 "(elSink|elsink|helSink|helsink|helzink|xelsink)-(banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink)",
                                 NameType.GENERIC,
                                 RuleType.EXACT,
-                                false });
+                                false, 10 });
     }
 
     private final boolean concat;
@@ -61,23 +58,37 @@ public class PhoneticEngineTest {
     private final NameType nameType;
     private final String phoneticExpected;
     private final RuleType ruleType;
+    private final int maxPhonemes;
 
-    public PhoneticEngineTest(String name, String phoneticExpected, NameType nameType, RuleType ruleType, boolean concat) {
+    public PhoneticEngineTest(String name, String phoneticExpected, NameType nameType,
+                              RuleType ruleType, boolean concat, int maxPhonemes) {
         this.name = name;
         this.phoneticExpected = phoneticExpected;
         this.nameType = nameType;
         this.ruleType = ruleType;
         this.concat = concat;
+        this.maxPhonemes = maxPhonemes;
     }
 
     @Test(timeout = 10000L)
     public void testEncode() {
-        PhoneticEngine engine = new PhoneticEngine(this.nameType, this.ruleType, this.concat);
+        PhoneticEngine engine = new PhoneticEngine(this.nameType, this.ruleType, this.concat, this.maxPhonemes);
 
         String phoneticActual = engine.encode(this.name);
 
         //System.err.println("expecting: " + this.phoneticExpected);
         //System.err.println("actual:    " + phoneticActual);
         assertEquals("phoneme incorrect", this.phoneticExpected, phoneticActual);
+
+        if (this.concat) {
+            String[] split = phoneticActual.split("\\|");
+            assertTrue(split.length <= this.maxPhonemes);
+        } else {
+            String[] words = phoneticActual.split("-");
+            for (String word : words) {
+                String[] split = word.split("\\|");
+                assertTrue(split.length <= this.maxPhonemes);
+            }
+        }
     }
 }