You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/03/24 15:06:51 UTC
svn commit: r1304799 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
Author: mikemccand
Date: Sat Mar 24 14:06:51 2012
New Revision: 1304799
URL: http://svn.apache.org/viewvc?rev=1304799&view=rev
Log:
LUCENE-3905: don't split up a surrogate pair when truncating too-long text
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304799&r1=1304798&r2=1304799&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Mar 24 14:06:51 2012
@@ -369,7 +369,12 @@ public abstract class BaseTokenStreamTes
if (random.nextInt(10) == 7) {
text = docs.nextDoc().get("body");
if (text.length() > maxWordLength) {
- text = text.substring(0, maxWordLength);
+ // Take care not to split up a surrogate pair:
+ if (Character.isHighSurrogate(text.charAt(maxWordLength-1))) {
+ text = text.substring(0, maxWordLength-1);
+ } else {
+ text = text.substring(0, maxWordLength);
+ }
}
} else {
if (simple) {