You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ju...@apache.org on 2010/01/15 21:43:49 UTC
svn commit: r899806 - /pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java

Author: jukka
Date: Fri Jan 15 20:43:48 2010
New Revision: 899806

URL: http://svn.apache.org/viewvc?rev=899806&view=rev
Log:
PDFBOX-604: Various text extraction performance improvements

Most calls to ICU4JImpl.normalizePres(String) return the argument as-is, so it's best to allocate a new buffer only when one is really needed.

Modified:
    pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java

Modified: pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java?rev=899806&r1=899805&r2=899806&view=diff
==============================================================================
--- pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java (original)
+++ pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java Fri Jan 15 20:43:48 2010
@@ -76,40 +76,46 @@
      */
     public String normalizePres(String str)
     {
-        String retStr = "";
-        for (int i = 0; i < str.length(); i++) 
+        StringBuilder builder = null;
+        int p = 0;
+        int q = 0;
+        for (; q < str.length(); q++) 
         {
-            /* We only normalize if the codepoint is in a given range. Otherwise, 
-             * NFKC converts too many things that would cause confusion. For example,
-             * it converts the micro symbol in extended latin to the value in the greek
-             * script. We normalize the Unicode Alphabetic and Arabic A&B Presentation forms.
-             */
-            char c = str.charAt(i);
-            if (((c >= 0xFB00) && (c <= 0xFDFF)) ||
-                    ((c >= 0xFE70) && (c <= 0xFEFF)))
+            // We only normalize if the codepoint is in a given range.
+            // Otherwise, NFKC converts too many things that would cause
+            // confusion. For example, it converts the micro symbol in
+            // extended Latin to the value in the Greek script. We normalize
+            // the Unicode Alphabetic and Arabic A&B Presentation forms.
+            char c = str.charAt(q);
+            if ((0xFB00 <= c && c <= 0xFDFF) || (0xFE70 <= c && c <= 0xFEFF))
             {
-                /* Some fonts map U+FDF2 differently than the Unicode spec.
-                 * They add an extra U+0627 character to compensate.  
-                 * This removes the extra character for those fonts. */ 
-                if((c == 0xFDF2) && (i > 0) && ((str.charAt(i-1) == 0x0627) || (str.charAt(i-1) == 0xFE8D))) 
+                if (builder == null) {
+                    builder = new StringBuilder(str.length() * 2);
+                }
+                builder.append(str.substring(p, q));
+                // Some fonts map U+FDF2 differently than the Unicode spec.
+                // They add an extra U+0627 character to compensate.
+                // This removes the extra character for those fonts. 
+                if(c == 0xFDF2 && q > 0 && (str.charAt(q-1) == 0x0627 || str.charAt(q-1) == 0xFE8D))
                 {
-                    retStr += "\u0644\u0644\u0647";
+                    builder.append("\u0644\u0644\u0647");
                 }
                 else
                 {
-                    /*
-                     * Trim because some decompositions have an extra space, such as
-                     * U+FC5E
-                     */
-                    retStr += Normalizer.normalize(c, Normalizer.NFKC).trim(); 
+                    // Trim because some decompositions have an extra space,
+                    // such as U+FC5E
+                    builder.append(
+                            Normalizer.normalize(c, Normalizer.NFKC).trim());
                 }
-            }
-            else 
-            {
-                retStr += str.charAt(i);
+                p = q + 1;
             }
         }
-        return retStr;
+        if (builder == null) {
+            return str;
+        } else {
+            builder.append(str.substring(p, q));
+            return builder.toString();
+        }
     }
     
     /**