You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/24 16:06:42 UTC

svn commit: r1304821 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java

Author: rmuir
Date: Sat Mar 24 15:06:41 2012
New Revision: 1304821

URL: http://svn.apache.org/viewvc?rev=1304821&view=rev
Log:
LUCENE-3911: improve BaseTokenStreamTestCase random string generation, fix off-by-ones in TestUtil string gen methods

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java

Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304821&r1=1304820&r2=1304821&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Mar 24 15:06:41 2012
@@ -365,8 +365,9 @@ public abstract class BaseTokenStreamTes
 
     for (int i = 0; i < iterations; i++) {
       String text;
-
+      
       if (random.nextInt(10) == 7) {
+        // real data from linedocs
         text = docs.nextDoc().get("body");
         if (text.length() > maxWordLength) {
           // Take care not to split up a surrogate pair:
@@ -377,25 +378,11 @@ public abstract class BaseTokenStreamTes
           }
         }
       } else {
-        if (simple) { 
-          text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
-        } else {
-          switch(_TestUtil.nextInt(random, 0, 4)) {
-          case 0: 
-            text = _TestUtil.randomSimpleString(random, maxWordLength);
-            break;
-          case 1:
-            text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
-            break;
-          case 2:
-            text = _TestUtil.randomHtmlishString(random, maxWordLength);
-            break;
-          default:
-            text = _TestUtil.randomUnicodeString(random, maxWordLength);
-          }
-        }
+        // synthetic
+        text = randomAnalysisString(random, maxWordLength, simple);
       }
 
+
       if (VERBOSE) {
         System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }
@@ -581,6 +568,65 @@ public abstract class BaseTokenStreamTes
       }
     }
   }
+  
+  /** Returns a random string shorter than maxLength chars: about 1 call in 31 a single random run, otherwise space-separated 'words'. */
+  private static String randomAnalysisString(Random random, int maxLength, boolean simple) {
+    assert maxLength >= 0;
+    // NOTE(review): random.nextInt(maxLength) below throws IllegalArgumentException when maxLength == 0, which the assert still permits
+    // sometimes just a purely random string
+    if (random.nextInt(31) == 0) {
+      return randomSubString(random, random.nextInt(maxLength), simple);
+    }
+    // otherwise, try to make it more realistic with 'words' since most tests use MockTokenizer
+    // first decide how big the string will really be: 0..n-1 (the bound passed to nextInt is exclusive)
+    maxLength = random.nextInt(maxLength);
+    int avgWordLength = _TestUtil.nextInt(random, 3, 8);
+    StringBuilder sb = new StringBuilder();
+    while (sb.length() < maxLength) {
+      if (sb.length() > 0) { // separate every word after the first with a single space
+        sb.append(' ');
+      }
+      int wordLength = -1;
+      while (wordLength < 0) { // redraw until the sampled length is non-negative
+        wordLength = (int) (random.nextGaussian() * 3 + avgWordLength); // Gaussian around avgWordLength, std dev 3
+      }
+      wordLength = Math.min(wordLength, maxLength - sb.length()); // never grow past the chosen total length
+      sb.append(randomSubString(random, wordLength, simple));
+    }
+    return sb.toString();
+  }
+  
+  /** Returns at most wordLength chars of random text: appends _TestUtil generator output until long enough, then trims the excess. */
+  private static String randomSubString(Random random, int wordLength, boolean simple) {
+    if (wordLength == 0) {
+      return "";
+    }
+    // drawn once per call; only consulted when !simple, where it selects the generator used for every chunk
+    int evilness = _TestUtil.nextInt(random, 0, 20);
+    StringBuilder sb = new StringBuilder();
+    while (sb.length() < wordLength) {; // NOTE(review): the ';' after '{' is an empty statement with no effect
+      if (simple) { // per chunk, a coin flip between the simple and the htmlish generator
+        sb.append(random.nextBoolean() ? _TestUtil.randomSimpleString(random, wordLength) : _TestUtil.randomHtmlishString(random, wordLength));
+      } else {
+        if (evilness < 10) {
+          sb.append(_TestUtil.randomSimpleString(random, wordLength));
+        } else if (evilness < 15) {
+          sb.append(_TestUtil.randomRealisticUnicodeString(random, wordLength));
+        } else if (evilness == 16) { // NOTE(review): 15 is not matched here and falls to the final else; confirm that is intended
+          sb.append(_TestUtil.randomHtmlishString(random, wordLength));
+        } else {
+          sb.append(_TestUtil.randomUnicodeString(random, wordLength));
+        }
+      }
+    }
+    if (sb.length() > wordLength) { // the last chunk overshot the requested length
+      sb.setLength(wordLength);
+      if (Character.isHighSurrogate(sb.charAt(wordLength-1))) { // the cut split a surrogate pair; drop the orphaned high half
+        sb.setLength(wordLength-1);
+      }
+    }
+    return sb.toString();
+  }
 
   protected String toDot(Analyzer a, String inputText) throws IOException {
     final StringWriter sw = new StringWriter();

Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java?rev=1304821&r1=1304820&r2=1304821&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java Sat Mar 24 15:06:41 2012
@@ -212,7 +212,7 @@ public class _TestUtil {
   }
 
   public static String randomSimpleString(Random r, int maxLength) {
-    final int end = r.nextInt(maxLength);
+    final int end = nextInt(r, 0, maxLength);
     if (end == 0) {
       // allow 0 length
       return "";
@@ -237,7 +237,7 @@ public class _TestUtil {
    * Returns a random string up to a certain length.
    */
   public static String randomUnicodeString(Random r, int maxLength) {
-    final int end = r.nextInt(maxLength);
+    final int end = nextInt(r, 0, maxLength);
     if (end == 0) {
       // allow 0 length
       return "";
@@ -341,7 +341,7 @@ public class _TestUtil {
   };
   
   public static String randomHtmlishString(Random random, int numElements) {
-    final int end = random.nextInt(numElements);
+    final int end = nextInt(random, 0, numElements);
     if (end == 0) {
       // allow 0 length
       return "";
@@ -486,12 +486,12 @@ public class _TestUtil {
   
   /** Returns random string of length up to maxLength codepoints , all codepoints within the same unicode block. */
   public static String randomRealisticUnicodeString(Random r, int maxLength) {
-    return randomRealisticUnicodeString(r, 0, 20);
+    return randomRealisticUnicodeString(r, 0, maxLength);
   }
 
   /** Returns random string of length between min and max codepoints, all codepoints within the same unicode block. */
   public static String randomRealisticUnicodeString(Random r, int minLength, int maxLength) {
-    final int end = minLength + r.nextInt(maxLength);
+    final int end = nextInt(r, minLength, maxLength);
     final int block = r.nextInt(blockStarts.length);
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < end; i++)