You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2013/01/17 21:20:18 UTC

svn commit: r1434892 - /uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/OptimizeStrings.java

Author: schor
Date: Thu Jan 17 20:20:18 2013
New Revision: 1434892

URL: http://svn.apache.org/viewvc?rev=1434892&view=rev
Log:
[UIMA-2515] do better work-around for sort failure on large arrays, using TreeSet

Modified:
    uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/OptimizeStrings.java

Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/OptimizeStrings.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/OptimizeStrings.java?rev=1434892&r1=1434891&r2=1434892&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/OptimizeStrings.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/OptimizeStrings.java Thu Jan 17 20:20:18 2013
@@ -22,8 +22,11 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
 
 /**
  * Share common underlying char[] among strings: Optimize sets of strings for
@@ -261,37 +264,22 @@ public class OptimizeStrings {
     optimizeI(sa); 
     inStrings = new ArrayList<String>();  // release space
   }
-  
-  /**
-   * Internal routine works on a copy of the original array and sorts it
-   */
-  public void optimize(String [] sa, int start, int end) {
-    int length = end - start;
-    String[] sortedStrings = new String[length];
-    System.arraycopy(sa, start, sortedStrings, 0, length);
-    optimizeI(sortedStrings);
-  }
-   
-  private void optimizeI(final String[] sortedStrings) {
+     
+  private void optimizeI(String[] sortedStrings) {
     savedCharsExact = 0;
     savedCharsSubstr = 0;
     StringBuilder sb = new StringBuilder();
+        
+    String[] sortedStrings2 = sortStrings(sortedStrings);
     
-    // debug intermittent sort failure
-//    String[] temp = new String[sortedStrings.length];
-//    
-//    for (debugi = 0; debugi < 1000; debugi++) {
-//      System.arraycopy(sortedStrings, 0, temp, 0, sortedStrings.length);
-//      sortStrings(temp);
-//      if ((debugi % 50) == 0) {
-//        System.out.println(" " + debugi);
-//      }
-//    }
-    
-    sortStrings(sortedStrings);
-    
-    int ssLength = eliminateSortedStringDuplicates(sortedStrings);
- 
+    int ssLength;
+    if (sortedStrings2 != sortedStrings) {
+      // in this case, duplicates have already been eliminated
+      ssLength = sortedStrings2.length;
+      sortedStrings = sortedStrings2;
+    } else {
+      ssLength = eliminateSortedStringDuplicates(sortedStrings);
+    }
     // create Offsets table
     // sorted so longer ones follow shorter subsumed string
     String previous = "";
@@ -369,23 +357,46 @@ public class OptimizeStrings {
     return to;  // to is length, is also where next string would be copied to
   }
   
-  private void sortStrings(String[] sa) {
-    int retryCount = 0;
-    while (true) {
-      try {
-        Arrays.sort(sa); 
-      } catch (StackOverflowError e) {
-//        throw e;
-        System.err.format("Caught ArraySort stack overflow, original array is " +
-            " %,d entries%n", inStrings.size());
-        sa = inStrings.toArray(new String[inStrings.size()]);
-        retryCount ++;
-        if (retryCount < 20) { // 2 is too low to avoid JIT issue
-          continue;
+  private String[] sortStrings(String[] sa) {
+    try {
+      Arrays.sort(sa);
+
+      // testing
+//      Set<String> orderedSet = new TreeSet<String>();
+//      for (String s : inStrings) {
+//        orderedSet.add(s);
+//      }
+//      Iterator<String> it = orderedSet.iterator();
+//      for (int i = 0; i < sa.length; i++) {
+//        String s = it.next();
+//        if (sa[i] != s) {
+//          throw new RuntimeException(String.format(
+//              "Mismatch, old way sort(i = %,d) = %s, new way is %s", i, s));
+//        }
+//        while ((i < sa.length - 1) && (sa[i + 1].equals(sa[i]))) {
+//         i++;
+//        }
+//      }
+      // end of test
+      
+      
+      return sa;
+    } catch (StackOverflowError e) {
+      // see https://issues.apache.org/jira/browse/UIMA-2515
+      // debug/test
+//      System.out.println("hit stack overflow");
+      Set<String> orderedSet = new TreeSet<String>();
+      for (String s : inStrings) {
+        if (!orderedSet.add(s)) {
+          savedCharsExact += s.length();
         }
-        throw e;
       }
-      break;
+      sa = new String[orderedSet.size()]; // has dups removed already
+      Iterator<String> it = orderedSet.iterator();
+      for (int i = 0; i < sa.length; i++) {
+        sa[i] = it.next();
+      }
+      return sa;
     }
   }
 }