You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/03/24 17:08:56 UTC
svn commit: r1304839 - /lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
Author: mikemccand
Date: Sat Mar 24 16:08:56 2012
New Revision: 1304839
URL: http://svn.apache.org/viewvc?rev=1304839&view=rev
Log:
LUCENE-3905: if a real doc's text is too big, take a random slice (not just the prefix string)
Modified:
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304839&r1=1304838&r2=1304839&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Mar 24 16:08:56 2012
@@ -370,12 +370,19 @@ public abstract class BaseTokenStreamTes
// real data from linedocs
text = docs.nextDoc().get("body");
if (text.length() > maxWordLength) {
- // Take care not to split up a surrogate pair:
- if (Character.isHighSurrogate(text.charAt(maxWordLength-1))) {
- text = text.substring(0, maxWordLength-1);
- } else {
- text = text.substring(0, maxWordLength);
+
+ // Take a random slice from the text...:
+ int startPos = random.nextInt(text.length() - maxWordLength);
+ if (startPos > 0 && Character.isLowSurrogate(text.charAt(startPos))) {
+ // Take care not to split up a surrogate pair:
+ startPos--;
+ }
+ int endPos = startPos + maxWordLength - 1;
+ if (Character.isHighSurrogate(text.charAt(endPos))) {
+ // Take care not to split up a surrogate pair:
+ endPos--;
}
+ text = text.substring(startPos, 1+endPos);
}
} else {
// synthetic