You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2012/03/07 22:02:59 UTC
svn commit: r1298118 - in /commons/proper/codec/trunk/src: changes/
main/java/org/apache/commons/codec/language/bm/
test/java/org/apache/commons/codec/language/bm/
Author: ggregory
Date: Wed Mar 7 21:02:59 2012
New Revision: 1298118
URL: http://svn.apache.org/viewvc?rev=1298118&view=rev
Log:
[CODEC-132] BeiderMorseEncoder OOM issues
Modified:
commons/proper/codec/trunk/src/changes/changes.xml
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
Modified: commons/proper/codec/trunk/src/changes/changes.xml
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/changes/changes.xml?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/changes/changes.xml (original)
+++ commons/proper/codec/trunk/src/changes/changes.xml Wed Mar 7 21:02:59 2012
@@ -26,6 +26,9 @@
org.apache.commons.codec.net.URLCodec charset field final. </action> </release>
-->
<release version="1.6.1" date="TBD" description="Feature and fix release.">
+ <action dev="ggregory" type="fix" issue="CODEC-132" due-to="rcmuir">
+ BeiderMorseEncoder OOM issues
+ </action>
<action dev="ggregory" type="fix" issue="CODEC-121" due-to="javajohn">
QuotedPrintableCodec does not support soft line break per the 'quoted-printable' example on Wikipedia
</action>
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java Wed Mar 7 21:02:59 2012
@@ -100,7 +100,7 @@ public class BeiderMorseEncoder implemen
/**
* Gets the name type currently in operation.
- *
+ *
* @return the NameType currently being used
*/
public NameType getNameType() {
@@ -109,7 +109,7 @@ public class BeiderMorseEncoder implemen
/**
* Gets the rule type currently in operation.
- *
+ *
* @return the RuleType currently being used
*/
public RuleType getRuleType() {
@@ -118,7 +118,7 @@ public class BeiderMorseEncoder implemen
/**
* Discovers if multiple possible encodings are concatenated.
- *
+ *
* @return true if multiple encodings are concatenated, false if just the first one is returned
*/
public boolean isConcat() {
@@ -127,33 +127,55 @@ public class BeiderMorseEncoder implemen
/**
* Sets how multiple possible phonetic encodings are combined.
- *
+ *
* @param concat
* true if multiple encodings are to be combined with a '|', false if just the first one is to be considered
*/
public void setConcat(boolean concat) {
- this.engine = new PhoneticEngine(this.engine.getNameType(), this.engine.getRuleType(), concat);
+ this.engine = new PhoneticEngine(this.engine.getNameType(),
+ this.engine.getRuleType(),
+ concat,
+ this.engine.getMaxPhonemes());
}
/**
* Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings optimized for Ashkenazi or
* Sephardic Jewish family names.
- *
+ *
* @param nameType
* the NameType in use
*/
public void setNameType(NameType nameType) {
- this.engine = new PhoneticEngine(nameType, this.engine.getRuleType(), this.engine.isConcat());
+ this.engine = new PhoneticEngine(nameType,
+ this.engine.getRuleType(),
+ this.engine.isConcat(),
+ this.engine.getMaxPhonemes());
}
/**
* Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered.
- *
+ *
* @param ruleType
* {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
*/
public void setRuleType(RuleType ruleType) {
- this.engine = new PhoneticEngine(this.engine.getNameType(), ruleType, this.engine.isConcat());
+ this.engine = new PhoneticEngine(this.engine.getNameType(),
+ ruleType,
+ this.engine.isConcat(),
+ this.engine.getMaxPhonemes());
+ }
+
+ /**
+ * Sets the maximum number of phonemes that shall be considered by the engine.
+ *
+ * @param maxPhonemes
+ * the maximum number of phonemes returned by the engine
+ */
+ public void setMaxPhonemes(int maxPhonemes) {
+ this.engine = new PhoneticEngine(this.engine.getNameType(),
+ this.engine.getRuleType(),
+ this.engine.isConcat(),
+ maxPhonemes);
}
}
Modified: commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original)
+++ commons/proper/codec/trunk/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Wed Mar 7 21:02:59 2012
@@ -101,17 +101,22 @@ public class PhoneticEngine {
* incompatible.
*
* @param phonemeExpr the expression to apply
+ * @param maxPhonemes the maximum number of phonemes to build up
* @return a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
* in turn
*/
- public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
+ public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr, int maxPhonemes) {
Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
- for (Rule.Phoneme left : this.phonemes) {
+ EXPR: for (Rule.Phoneme left : this.phonemes) {
for (Rule.Phoneme right : phonemeExpr.getPhonemes()) {
Rule.Phoneme join = left.join(right);
if (!join.getLanguages().isEmpty()) {
- newPhonemes.add(join);
+ if (newPhonemes.size() < maxPhonemes) {
+ newPhonemes.add(join);
+ } else {
+ break EXPR;
+ }
}
}
}
@@ -168,9 +173,11 @@ public class PhoneticEngine {
private PhonemeBuilder phonemeBuilder;
private int i;
+ private int maxPhonemes;
private boolean found;
- public RulesApplication(List<Rule> finalRules, CharSequence input, PhonemeBuilder phonemeBuilder, int i) {
+ public RulesApplication(List<Rule> finalRules, CharSequence input,
+ PhonemeBuilder phonemeBuilder, int i, int maxPhonemes) {
if (finalRules == null) {
throw new NullPointerException("The finalRules argument must not be null");
}
@@ -178,6 +185,7 @@ public class PhoneticEngine {
this.phonemeBuilder = phonemeBuilder;
this.input = input;
this.i = i;
+ this.maxPhonemes = maxPhonemes;
}
public int getI() {
@@ -208,7 +216,7 @@ public class PhoneticEngine {
continue RULES;
}
- this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme());
+ this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes);
this.found = true;
break RULES;
}
@@ -289,6 +297,8 @@ public class PhoneticEngine {
return sb.toString();
}
+ private static final int DEFAULT_MAX_PHONEMES = 20;
+
private final Lang lang;
private final NameType nameType;
@@ -297,9 +307,11 @@ public class PhoneticEngine {
private final boolean concat;
+ private final int maxPhonemes;
+
/**
* Generates a new, fully-configured phonetic engine.
- *
+ *
* @param nameType
* the type of names it will use
* @param ruleType
@@ -308,6 +320,22 @@ public class PhoneticEngine {
* if it will concatenate multiple encodings
*/
public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat) {
+ this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES);
+ }
+
+ /**
+ * Generates a new, fully-configured phonetic engine.
+ *
+ * @param nameType
+ * the type of names it will use
+ * @param ruleType
+ * the type of rules it will apply
+ * @param concat
+ * if it will concatenate multiple encodings
+ * @param maxPhonemes
+ * the maximum number of phonemes that will be handled
+ */
+ public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat, int maxPhonemes) {
if (ruleType == RuleType.RULES) {
throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES);
}
@@ -315,6 +343,7 @@ public class PhoneticEngine {
this.ruleType = ruleType;
this.concat = concat;
this.lang = Lang.instance(nameType);
+ this.maxPhonemes = maxPhonemes;
}
/**
@@ -341,7 +370,8 @@ public class PhoneticEngine {
// System.err.println("Expanding: " + phonemeText);
for (int i = 0; i < phonemeText.length();) {
- RulesApplication rulesApplication = new RulesApplication(finalRules, phonemeText, subBuilder, i).invoke();
+ RulesApplication rulesApplication =
+ new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke();
boolean found = rulesApplication.isFound();
subBuilder = rulesApplication.getPhonemeBuilder();
@@ -459,7 +489,8 @@ public class PhoneticEngine {
// loop over each char in the input - we will handle the increment manually
CharSequence inputCache = cacheSubSequence(input);
for (int i = 0; i < inputCache.length();) {
- RulesApplication rulesApplication = new RulesApplication(rules, inputCache, phonemeBuilder, i).invoke();
+ RulesApplication rulesApplication =
+ new RulesApplication(rules, inputCache, phonemeBuilder, i, maxPhonemes).invoke();
i = rulesApplication.getI();
phonemeBuilder = rulesApplication.getPhonemeBuilder();
// System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
@@ -508,4 +539,13 @@ public class PhoneticEngine {
public boolean isConcat() {
return this.concat;
}
+
+ /**
+ * Gets the maximum number of phonemes the engine will calculate for a given input.
+ *
+ * @return the maximum number of phonemes
+ */
+ public int getMaxPhonemes() {
+ return this.maxPhonemes;
+ }
}
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java Wed Mar 7 21:02:59 2012
@@ -19,6 +19,7 @@ package org.apache.commons.codec.languag
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
@@ -60,7 +61,7 @@ public class BeiderMorseEncoderTest exte
public void testAllChars() throws EncoderException {
BeiderMorseEncoder bmpm = createGenericApproxEncoder();
for (char c = Character.MIN_VALUE; c < Character.MAX_VALUE; c++) {
- bmpm.encode("" + c);
+ bmpm.encode(Character.toString(c));
}
}
@@ -68,7 +69,7 @@ public class BeiderMorseEncoderTest exte
public void testAsciiEncodeNotEmpty1Letter() throws EncoderException {
BeiderMorseEncoder bmpm = createGenericApproxEncoder();
for (char c = 'a'; c <= 'z'; c++) {
- final String value = "" + c;
+ final String value = Character.toString(c);
final String valueU = value.toUpperCase();
assertNotEmpty(bmpm, value);
assertNotEmpty(bmpm, valueU);
@@ -138,6 +139,24 @@ public class BeiderMorseEncoderTest exte
}
@Test
+ public void testOOM() throws EncoderException {
+ String phrase = "200697900'-->�</ bceaeef >aadaabcf\"aedfbff<!--\'-->?>cae"
+ + "cfaaa><?&#<!--</script>&lang&fc;aadeaf?>>&bdquo< cc =\"abff\" /></ afe >"
+ + "<script><!-- f(';< cf aefbeef = \"bfabadcf\" ebbfeedd = fccabeb >";
+
+ BeiderMorseEncoder encoder = new BeiderMorseEncoder();
+ encoder.setNameType(NameType.GENERIC);
+ encoder.setRuleType(RuleType.EXACT);
+ encoder.setMaxPhonemes(10);
+
+ String phonemes = encoder.encode(phrase);
+ assertTrue(phonemes.length() > 0);
+
+ String[] phonemeArr = phonemes.split("\\|");
+ assertTrue(phonemeArr.length <= 10);
+ }
+
+ @Test
public void testSetConcat() {
BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
bmpm.setConcat(false);
Modified: commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java?rev=1298118&r1=1298117&r2=1298118&view=diff
==============================================================================
--- commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java (original)
+++ commons/proper/codec/trunk/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java Wed Mar 7 21:02:59 2012
@@ -18,6 +18,7 @@
package org.apache.commons.codec.language.bm;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.util.Arrays;
import java.util.List;
@@ -38,22 +39,18 @@ public class PhoneticEngineTest {
@Parameterized.Parameters
public static List<Object[]> data() {
return Arrays
- .asList(new Object[] {
- "Renault",
- "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult",
- NameType.GENERIC,
- RuleType.APPROX,
- true },
- new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true },
- new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true },
- new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true },
- new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true },
+ .asList(new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX, true, 10 },
+ new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true, 10 },
+ new Object[] { "Renault", "rYnDlt", NameType.ASHKENAZI, RuleType.APPROX, true, 1 },
+ new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true, 10 },
+ new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true, 10 },
+ new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true, 10 },
new Object[] {
"van helsing",
"(elSink|elsink|helSink|helsink|helzink|xelsink)-(banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink)",
NameType.GENERIC,
RuleType.EXACT,
- false });
+ false, 10 });
}
private final boolean concat;
@@ -61,23 +58,37 @@ public class PhoneticEngineTest {
private final NameType nameType;
private final String phoneticExpected;
private final RuleType ruleType;
+ private final int maxPhonemes;
- public PhoneticEngineTest(String name, String phoneticExpected, NameType nameType, RuleType ruleType, boolean concat) {
+ public PhoneticEngineTest(String name, String phoneticExpected, NameType nameType,
+ RuleType ruleType, boolean concat, int maxPhonemes) {
this.name = name;
this.phoneticExpected = phoneticExpected;
this.nameType = nameType;
this.ruleType = ruleType;
this.concat = concat;
+ this.maxPhonemes = maxPhonemes;
}
@Test(timeout = 10000L)
public void testEncode() {
- PhoneticEngine engine = new PhoneticEngine(this.nameType, this.ruleType, this.concat);
+ PhoneticEngine engine = new PhoneticEngine(this.nameType, this.ruleType, this.concat, this.maxPhonemes);
String phoneticActual = engine.encode(this.name);
//System.err.println("expecting: " + this.phoneticExpected);
//System.err.println("actual: " + phoneticActual);
assertEquals("phoneme incorrect", this.phoneticExpected, phoneticActual);
+
+ if (this.concat) {
+ String[] split = phoneticActual.split("\\|");
+ assertTrue(split.length <= this.maxPhonemes);
+ } else {
+ String[] words = phoneticActual.split("-");
+ for (String word : words) {
+ String[] split = word.split("\\|");
+ assertTrue(split.length <= this.maxPhonemes);
+ }
+ }
}
}