You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ca...@apache.org on 2009/01/13 16:17:32 UTC

svn commit: r734151 - in /incubator/pdfbox/trunk: ./ external/ src/main/java/org/apache/pdfbox/examples/util/ src/main/java/org/apache/pdfbox/pdfviewer/ src/main/java/org/apache/pdfbox/util/ src/main/java/org/apache/pdfbox/util/operator/ test/input/

Author: carrier
Date: Tue Jan 13 07:17:01 2009
New Revision: 734151

URL: http://svn.apache.org/viewvc?rev=734151&view=rev
Log:
Fix for PDFBOX-377 along with cleanup to make method names more consistent

Added:
    incubator/pdfbox/trunk/external/icu4j-4_0.jar   (with props)
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java   (with props)
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java   (with props)
Modified:
    incubator/pdfbox/trunk/README.txt
    incubator/pdfbox/trunk/build.xml
    incubator/pdfbox/trunk/pom.xml
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
    incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt
    incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
    incubator/pdfbox/trunk/test/input/cweb.pdf.txt

Modified: incubator/pdfbox/trunk/README.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/README.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/README.txt (original)
+++ incubator/pdfbox/trunk/README.txt Tue Jan 13 07:17:01 2009
@@ -30,6 +30,15 @@
    in your classpath. The easiest solution is to simply include the
    apache-pdfbox-x.x.x.jar in your classpath.
 
+3. You get text that has the correct characters, but in the wrong
+   order.  This might be because you have not enabled sorting.  The text
+   in PDF files is stored in chunks and the chunks do not need to be stored 
+   in the order that they are displayed on a page.  By default, PDFBox does 
+   not sort the text.  Also, if you have text in a language that reads right to left 
+   (such as Arabic or Hebrew), make sure you have the ICU4J jar file in your 
+   classpath.  This library is needed to properly handle right to left text.
+
+
 See the issue tracker at https://issues.apache.org/jira/browse/PDFBOX for
 the full list of known issues and requested features.
 

Modified: incubator/pdfbox/trunk/build.xml
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/build.xml?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/build.xml (original)
+++ incubator/pdfbox/trunk/build.xml Tue Jan 13 07:17:01 2009
@@ -69,6 +69,7 @@
     <property name="jai_core.jar" value="${jar.dir}/${jai_core.name}.jar" />
 
     <property name="junit.jar" value="${jar.dir}/junit.jar" />
+    <property name="icu4j.jar" value="${jar.dir}/icu4j-4_0.jar" />
 
     <property name="testoutput.dir" value="test/output"/>
 
@@ -116,6 +117,7 @@
         <junit printsummary="off" fork="on" dir=".">
             <sysproperty key="org.apache.pdfbox.util.TextStripper.file" value="${file}"/>
             <classpath>
+                <pathelement path="${junit.jar}" />
                 <pathelement path="${ant.jar}" />
                 <pathelement path="${lucene.jar}" />
                 <pathelement path="${lucene-demo.jar}" />
@@ -125,6 +127,7 @@
                 <pathelement path="${bcmail.jar}" />
                 <pathelement path="${jai_codec.jar}" />
                 <pathelement path="${jai_core.jar}" />
+                <pathelement path="${icu4j.jar}" />
                 <pathelement path="${dest.dir}"/>
                 <pathelement path="${resources.dir}" />
             </classpath>
@@ -140,6 +143,7 @@
 
             <sysproperty key="org.apache.pdfbox.util.TextStripper.file" value="${file}"/>
             <classpath>
+                <pathelement path="${junit.jar}" />
                 <pathelement path="${ant.jar}" />
                 <pathelement path="${lucene.jar}" />
                 <pathelement path="${lucene-demo.jar}" />
@@ -149,6 +153,7 @@
                 <pathelement path="${bcmail.jar}" />
                 <pathelement path="${jai_codec.jar}" />
                 <pathelement path="${jai_core.jar}" />
+                <pathelement path="${icu4j.jar}" />
                 <pathelement path="${dest.dir}"/>
                 <pathelement path="${resources.dir}" />
             </classpath>
@@ -160,6 +165,7 @@
     <target name="test-junit" depends="clean,compile" description="run junit tests">
         <junit printsummary="off" fork="on" dir=".">
             <classpath>
+                <pathelement path="${junit.jar}" />
                 <pathelement path="${ant.jar}" />
                 <pathelement path="${lucene.jar}" />
                 <pathelement path="${lucene-demo.jar}" />
@@ -286,6 +292,7 @@
                 <include name="${jai_codec.jar}" />
                 <include name="${jai_core.jar}" />
                 <include name="${checkstyle.jar}" />
+                <include name="${icu4j.jar}" />
                 <include name="${bin.dir}/**/*" />
                 <include name="pom.xml" />
                 <include name="build.xml" />

Added: incubator/pdfbox/trunk/external/icu4j-4_0.jar
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/external/icu4j-4_0.jar?rev=734151&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/pdfbox/trunk/external/icu4j-4_0.jar
------------------------------------------------------------------------------
    svn:executable = *

Propchange: incubator/pdfbox/trunk/external/icu4j-4_0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: incubator/pdfbox/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/pom.xml?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/pom.xml (original)
+++ incubator/pdfbox/trunk/pom.xml Tue Jan 13 07:17:01 2009
@@ -92,6 +92,13 @@
     </dependency>
 
     <dependency>
+        <groupId>com.ibm.icu</groupId>
+        <artifactId>icu4j</artifactId>
+        <version>3.8</version>
+        <optional>true</optional>
+    </dependency>
+
+    <dependency>
         <groupId>junit</groupId>
         <artifactId>junit</artifactId>
         <version>4.5</version>

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java Tue Jan 13 07:17:01 2009
@@ -105,11 +105,11 @@
 
     /**
      * A method provided as an event interface to allow a subclass to perform
-     * some specific functionality when a character needs to be displayed.
+     * some specific functionality when text needs to be processed
      *
-     * @param text The character to be displayed.
+     * @param text The text to be processed
      */
-    protected void showCharacter( TextPosition text )
+    protected void processTextPosition( TextPosition text )
     {
         System.out.println( "String[" + text.getX() + "," +
                 text.getY() + " fs=" + text.getFontSize() + " xscale=" +

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java Tue Jan 13 07:17:01 2009
@@ -130,11 +130,11 @@
 
     /**
      * You should override this method if you want to perform an action when a
-     * string is being shown.
+     * text is being processed. 
      *
-     * @param text The string to display.
+     * @param text The text to process 
      */
-    protected void showCharacter( TextPosition text )
+    protected void processTextPosition( TextPosition text )
     {
         //should use colorspaces for the font color but for now assume that
         //the font color is black

Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java?rev=734151&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java Tue Jan 13 07:17:01 2009
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+import com.ibm.icu.text.Bidi;
+import com.ibm.icu.text.Normalizer;
+
+/**
+ * This class is an implementation of the ICU4J class. TextNormalize 
+ * will call this only if the ICU4J library exists in the classpath.
+ */
+public class ICU4JImpl {
+    Bidi bidi;
+
+    public ICU4JImpl() {
+        bidi = new Bidi();
+
+        /* We do not use bidi.setInverse() because that uses
+         * Bidi.REORDER_INVERSE_NUMBERS_AS_L, which caused problems
+         * in some test files. For example, a file had a line of:
+         * 0 1 / ARABIC
+         * and the 0 and 1 were reversed in the end result.  
+         * REORDER_INVERSE_LIKE_DIRECT is the inverse Bidi mode 
+         * that more closely reflects the Unicode spec.
+         */
+        bidi.setReorderingMode(Bidi.REORDER_INVERSE_LIKE_DIRECT);
+    }
+
+    /**
+     * Takes a line of text in presentation order and converts it to logical order.
+     * @see TextNormalize.makeLineLogicalOrder(String, boolean)     
+     *  
+     */
+    public String makeLineLogicalOrder(String a_str, boolean a_isRtlDominant) {    	
+        bidi.setPara(a_str, a_isRtlDominant?Bidi.RTL:Bidi.LTR, null);
+
+        /* Set the mirror flag so that parentheses and other mirror symbols
+         * are properly reversed, when needed.  With this removed, lines
+         * such as (CBA) in the PDF file will come out like )ABC( in logical
+         * order.
+         */
+        return bidi.writeReordered(Bidi.DO_MIRRORING);
+    }
+
+    /**
+     * Normalize presentation forms of characters to the separate parts.
+     * @see TextNormalize.normalizePres(String)
+     * 
+     * @param a_str String to normalize
+     * @return Normalized form
+     */
+    public String normalizePres(String a_str) {
+        String retStr = "";
+        for (int i = 0; i < a_str.length(); i++) {
+            /* We only normalize if the codepoint is in a given range. Otherwise, 
+             * NFKC converts too many things that would cause confusion. For example,
+             * it converts the micro symbol in extended latin to the value in the greek
+             * script. 
+             */
+            if (((a_str.codePointAt(i) >= 0xFB00) && (a_str.codePointAt(i) <= 0xFDFF)) ||
+                    ((a_str.codePointAt(i) >= 0xFE70) && (a_str.codePointAt(i) <= 0xFEFF)))	{
+                retStr += Normalizer.normalize(a_str.charAt(i), Normalizer.NFKC);
+            }
+            else {
+                retStr += a_str.charAt(i);
+            }
+        }
+        return retStr; 
+    }
+}

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Tue Jan 13 07:17:01 2009
@@ -224,26 +224,44 @@
     }
 
     /**
+     * @deprecated
+     * @see processTextPosition(TextPosition)
+     */
+    protected void showCharacter( TextPosition text )
+    {
+    	processTextPosition(text);
+    }
+    
+    /**
      * A method provided as an event interface to allow a subclass to perform
-     * some specific functionality when a character needs to be displayed.
+     * some specific functionality when text needs to be processed.
      *
-     * @param text The character to be displayed.
+     * @param text The text to be processed.
      */
-    protected void showCharacter( TextPosition text )
+    protected void processTextPosition( TextPosition text )
     {
         //subclasses can override to provide specific functionality.
     }
 
+    
+    /**
+     * @deprecated
+     * @see processEncodedText(byte[])
+     */
+    public void showString( byte[] string ) throws IOException {
+    	processEncodedText(string);
+    }
+    
     /**
-     * You should override this method if you want to perform an action when a
-     * string is being shown.
+     * Process encoded text from the PDF Stream. 
+     * You should override this method if you want to perform an action when 
+     * encoded text is being processed.
      *
-     * @param string The string to display.
+     * @param string The encoded text
      *
-     * @throws IOException If there is an error showing the string
+     * @throws IOException If there is an error processing the string
      */
-
-    public void showString( byte[] string ) throws IOException
+    public void processEncodedText( byte[] string ) throws IOException
     {
     	/* Note on variable names.  There are three different units being used
     	 * in this code.  Character sizes are given in glyph units, text locations
@@ -425,7 +443,8 @@
         // convert textMatrix at the end of the string to display units
         Matrix textMatrixEndDisp = initialMatrix.multiply( textMatrix ).multiply( ctm );
         
-        showCharacter(
+        // process the decoded text
+        processTextPosition(
                 new TextPosition(
                 		page,
                 		textMatrixStDisp,

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Tue Jan 13 07:17:01 2009
@@ -68,7 +68,7 @@
         buf.append("</title>");
         buf.append("</head>");
         buf.append("<body>\n");
-        getOutput().write(buf.toString());
+        super.writeString(buf.toString());
     }
 
     /**
@@ -82,9 +82,17 @@
     }
 
     /**
+     * @deprecated
      * {@inheritDoc}
      */
-    protected void flushText() throws IOException
+    protected void flushText() throws IOException {
+        writePage();
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    protected void writePage() throws IOException
     {
         Iterator textIter = getCharactersByArticle().iterator();
 
@@ -94,7 +102,7 @@
             writeHeader();
             onFirstPage = false;
         }
-        super.flushText();
+        super.writePage();
     }
 
     /**
@@ -102,7 +110,7 @@
      */
     public void endDocument(PDDocument pdf) throws IOException
     {
-        output.write("</body></html>");
+        super.writeString("</body></html>");
     }
 
     /**
@@ -170,7 +178,7 @@
     {
         if (! suppressParagraphs)
         {
-            getOutput().write("<p>");
+            super.writeString("<p>");
         }
     }
     /**
@@ -182,58 +190,65 @@
     {
         if (! suppressParagraphs)
         {
-            getOutput().write("</p>");
+            super.writeString("</p>");
         }
     }
 
     /**
-     * {@inheritDoc}
+     * Write a string to the output stream and escape some HTML characters
      */
-    protected void writeCharacters(TextPosition position ) throws IOException
+    protected void writeString(String chars) throws IOException
     {
-        if (position == beginTitle)
-        {
-            output.write("<H1>");
-            suppressParagraphs = true;
-        }
-        if (position == afterEndTitle)
-        {
-            output.write("</H1>");  // end title and start first paragraph
-            suppressParagraphs = false;
-        }
-
-        String chars = position.getCharacter();
-
         for (int i = 0; i < chars.length(); i++)
         {
             char c = chars.charAt(i);
             if ((c < 32) || (c > 126))
             {
                 int charAsInt = c;
-                output.write("&#" + charAsInt + ";");
+                super.writeString("&#" + charAsInt + ";");
             }
             else
             {
                 switch (c)
                 {
-                    case 34:
-                        output.write("&quot;");
-                        break;
-                    case 38:
-                        output.write("&amp;");
-                        break;
-                    case 60:
-                        output.write("&lt;");
-                        break;
-                    case 62:
-                        output.write("&gt;");
-                        break;
-                    default:
-                        output.write(c);
+                case 34:
+                    super.writeString("&quot;");
+                    break;
+                case 38:
+                    super.writeString("&amp;");
+                    break;
+                case 60:
+                    super.writeString("&lt;");
+                    break;
+                case 62:
+                    super.writeString("&gt;");
+                    break;
+                default:
+                    super.writeString(String.valueOf(c));
                 }
             }
         }
     }
+    
+    /**
+     * {@inheritDoc}
+     */
+    protected void writeCharacters(TextPosition position ) throws IOException
+    {
+        if (position == beginTitle)
+        {
+            super.writeString("<H1>");
+            suppressParagraphs = true;
+        }
+        if (position == afterEndTitle)
+        {
+            super.writeString("</H1>");  // end title and start first paragraph
+            suppressParagraphs = false;
+        }
+
+        writeString(position.getCharacter());
+    }
+    
 
     /**
      * @return Returns the suppressParagraphs.

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Jan 13 07:17:01 2009
@@ -46,6 +46,10 @@
  * formatting and such.  Please note; it is up to clients of this class to verify that
  * a specific user has the correct permissions to extract text from the
  * PDF document.
+ * 
+ * The basic flow of this process is that we get a document and use a series of 
+ * processXXX() functions that work on smaller and smaller chunks of the page.  
+ * Eventually, we fully process each page and then print it. 
  *
  * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
  * @version $Revision: 1.70 $
@@ -94,6 +98,12 @@
     protected Writer output;
 
     /**
+     * The normalizer is used to remove text ligatures/presentation forms
+     * and to correct the direction of right to left text, such as Arabic and Hebrew.
+     */
+    private TextNormalize normalize = null;
+    
+    /**
      * Instantiate a new PDFTextStripper object.  This object will load properties from
      * Resources/PDFTextStripper.properties.
      * @throws IOException If there is an error loading the properties.
@@ -318,7 +328,7 @@
 
             characterListMapping.clear();
             processStream( page, page.findResources(), content );
-            flushText();
+            writePage();
             endPage( page );
         }
 
@@ -373,11 +383,22 @@
     }
 
     /**
-     * This will print the text to the output stream.
+     * @deprecated
+     * @see writePage(). 
+     */
+    protected void flushText() throws IOException {
+        writePage();
+    }
+
+    /**
+     * This will print the text of the processed page to "output".
+     * It will estimate, based on the coordinates of the text, where
+     * newlines and word spacings should be placed. The text will be
+     * sorted only if that feature was enabled. 
      *
      * @throws IOException If there is an error writing the text.
      */
-    protected void flushText() throws IOException
+    protected void writePage() throws IOException    
     {
         float maxYForLine = -1;
         float minYTopForLine = Float.MAX_VALUE;
@@ -389,6 +410,11 @@
         float maxHeightForLine = -1;
         //float lastHeightForLine = -1;
         TextPosition lastPosition = null;
+        
+        if (normalize == null) {
+            normalize = new TextNormalize();
+        }
+        
         for( int i = 0; i < charactersByArticle.size(); i++)
         {
             startParagraph();
@@ -400,6 +426,63 @@
             }
 
             Iterator textIter = textList.iterator();
+                        
+            /* Before we can display the text, we need to do some normalizing.
+             * Arabic and Hebrew text is right to left and is typically stored
+             * in its logical format, which means that the rightmost character is 
+             * stored first, followed by the second character from the right etc.
+             * However, PDF stores the text in presentation form, which is left to 
+             * right.  We need to do some normalization to convert the PDF data to
+             * the proper logical output format. 
+             * 
+             * Note that if we did not sort the text, then the output of reversing the
+             * text is undefined and can sometimes produce worse output than not trying
+             * to reverse the order.  Sorting should be done for these languages.
+             * */
+
+            /* First step is to determine if we have any right to left text, and 
+             * if so, is it dominant. */ 
+            int ltrCnt = 0;
+            int rtlCnt = 0;
+
+            while( textIter.hasNext() )
+            {
+                TextPosition position = (TextPosition)textIter.next();
+                String stringValue = position.getCharacter();
+
+                for (int a = 0; a < stringValue.length(); a++) {
+                    byte dir = Character.getDirectionality(stringValue.charAt(a));
+                    if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) || 
+                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
+                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE )) {
+                        ltrCnt++;
+                    }
+                    else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
+                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
+                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
+                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE )) {
+                        rtlCnt++;
+                    }
+                }
+            }
+
+            // choose the dominant direction
+            boolean isRtlDominant = false; 
+            if (rtlCnt > ltrCnt) {
+                isRtlDominant = true;
+            }
+
+            // we will later use this to skip reordering
+            boolean hasRtl = false;
+            if (rtlCnt > 0)
+                hasRtl = true;
+
+            /* Now cycle through to print the text.  
+             * We queue up a line at a time before we print so that we can convert
+             * the line from presentation form to logical form (if needed). */
+            String lineStr = "";
+            
+            textIter = textList.iterator();    // start from the beginning again
             while( textIter.hasNext() )
             {
                 TextPosition position = (TextPosition)textIter.next();
@@ -501,7 +584,22 @@
                     if( ( !overlap( positionY, positionHeight, maxYForLine, maxHeightForLine ) ))
                     		//maxYForLine - minYTopForLine)))
                     {
-                        processLineSeparator( position );
+                        // If we have RTL text on the page, change the direction
+                        if (hasRtl)
+                            lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
+
+                        /* normalize string to remove presentation forms.
+                         * Note that this must come after the line direction 
+                         * conversion because the process looks ahead to the next
+                         * logical character. 
+                         */
+                        lineStr = normalize.normalizePres(lineStr);
+
+                        writeString(lineStr);
+                        lineStr = "";
+
+                        writeLineSeparator( );
+
                         endOfLastTextX = -1;
                         expectedStartOfNextWordX = -1;
                         maxYForLine = -1;
@@ -512,12 +610,12 @@
                     }
 
 
-	                if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
-	                   //only bother adding a space if the last character was not a space
-	                   lastPosition.getCharacter() != null &&
-	                   !lastPosition.getCharacter().endsWith( " " ) )
-	                {
-	                    processWordSeparator( lastPosition, position );
+                    if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
+                            //only bother adding a space if the last character was not a space
+                            lastPosition.getCharacter() != null &&
+                            !lastPosition.getCharacter().endsWith( " " ) )
+                    {
+	                    lineStr += getWordSeparator();
 	                }
 	                else
 	                {
@@ -535,9 +633,10 @@
                 endOfLastTextX = positionX + positionWidth;
                 //endOfLastTextY = positionY;
 
+                // add it to the list
                 if (characterValue != null)
                 {
-                    writeCharacters( position );
+                    lineStr += characterValue;
                 }
                 else
                 {
@@ -549,36 +648,81 @@
                 //lastHeightForLine = position.getHeight();
                 lastWordSpacing = wordSpacing;
             }
+            
+            // print the final line
+            if (lineStr.length() > 0) {
+                if (hasRtl)
+                    lineStr = normalize.makeLineLogicalOrder(lineStr, isRtlDominant);
+
+                // normalize string to remove presentation forms
+                lineStr = normalize.normalizePres(lineStr);
+
+                writeString(lineStr);
+            }
+            
             endParagraph();
         }
 
+        writePageSeperator();;
+    }
 
+    private boolean overlap( float y1, float height1, float y2, float height2 )
+    {
+        return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
+               (y1 <= y2 && y1 >= y2-height2);
+    }
+
+    /**
+     * Write the page separator value to the output stream
+     * @throws IOException
+     */
+    protected void writePageSeperator() throws IOException
+    {
         // RDD - newline at end of flush - required for end of page (so that the top
         // of the next page starts on its own line.
         //
         output.write(getPageSeparator());
-
         output.flush();
     }
-
-    private boolean overlap( float y1, float height1, float y2, float height2 )
+    
+    /**
+     * Write the line separator value to the output stream
+     * @throws IOException
+     */
+    protected void writeLineSeparator( ) throws IOException
     {
-        return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
-               (y1 <= y2 && y1 >= y2-height2);
+        output.write(getLineSeparator());
     }
-
+    
+    /**
+     * @deprecated
+     * @see writeLineSeparator()
+     */
     protected void processLineSeparator( TextPosition currentText ) throws IOException
     {
-        output.write(getLineSeparator());
+    	writeLineSeparator();
     }
 
-    protected void processWordSeparator( TextPosition lastText, TextPosition currentText ) throws IOException
+    /**
+     * Write the word separator value to the output stream
+     * @throws IOException
+     */
+    protected void writeWordSeparator() throws IOException
     {
         output.write(getWordSeparator());
     }
+    
+    /**
+     * @deprecated
+     * @see writeWordSeparator() 
+     */
+    protected void processWordSeparator( TextPosition lastText, TextPosition currentText ) throws IOException
+    {
+    	writeWordSeparator();
+    }
 
     /**
-     * Write the string to the output stream.
+     * Write the string in TextPosition to the output stream.
      *
      * @param text The text to write to the stream.
      * @throws IOException If there is an error when writing the text.
@@ -587,6 +731,17 @@
     {
         output.write( text.getCharacter() );
     }
+    
+    /**
+     * Write a Java string to the output stream.
+     *
+     * @param text The text to write to the stream.
+     * @throws IOException If there is an error when writing the text.
+     */
+    protected void writeString( String text ) throws IOException
+    {
+        output.write( text );
+    }
 
     /**
      * This will determine of two floating point numbers are within a specified variance.
@@ -600,14 +755,25 @@
         return second > first - variance && second < first + variance;
     }
 
+    
     /**
-     * This will show add a character to the list of characters to be printed to
-     * the text file.
-     *
-     * @param text The description of the character to display.
+     * @deprecated
+     * {@inheritDoc}
      */
     protected void showCharacter( TextPosition text )
     {
+    	processTextPosition(text);
+    }
+    
+    /**
+     * This will process a TextPosition object and add the
+     * text to the list of characters on a page.  It takes care of
+     * overlapping text.
+     *
+     * @param text The text to process.
+     */
+    protected void processTextPosition( TextPosition text )
+    {
         boolean showCharacter = true;
         if( suppressDuplicateOverlappingText )
         {

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java Tue Jan 13 07:17:01 2009
@@ -116,9 +116,17 @@
     }
 
     /**
+     * @deprecated
      * {@inheritDoc}
      */
-    protected void showCharacter( TextPosition text )
+    protected void showCharacter( TextPosition text ) {
+        processTextPosition(text);
+    }
+    
+    /**
+     * {@inheritDoc}
+     */
+    protected void processTextPosition( TextPosition text )
     {
         Iterator regionIter = regionArea.keySet().iterator();
         while( regionIter.hasNext() )
@@ -128,17 +136,26 @@
             if( rect.contains( text.getX(), text.getY() ) )
             {
                 charactersByArticle = (Vector)regionCharacterList.get( region );
-                super.showCharacter( text );
+                super.processTextPosition( text );
             }
         }
     }
 
+    
+    /**
+     * @deprecated
+     * {@inheritDoc}
+     */
+    protected void flushText() throws IOException {
+    	writePage();
+    }
+    
     /**
-     * This will print the text to the output stream.
+     * This will print the processed page text to the output stream.
      *
      * @throws IOException If there is an error writing the text.
      */
-    protected void flushText() throws IOException
+    protected void writePage() throws IOException
     {
         Iterator regionIter = regionArea.keySet().iterator();
         while( regionIter.hasNext() )
@@ -146,7 +163,7 @@
             String region = (String)regionIter.next();
             charactersByArticle = (Vector)regionCharacterList.get( region );
             output = (StringWriter)regionText.get( region );
-            super.flushText();
+            super.writePage();
         }
     }
 }

Added: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java?rev=734151&view=auto
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java (added)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java Tue Jan 13 07:17:01 2009
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+/**
+ * This class allows a caller to normalize text in various ways.
+ * It delegates to ICU4J when the ICU4J classes can be loaded from the classpath;
+ * otherwise each method returns its input string unchanged.
+ */
+public class TextNormalize {
+    private ICU4JImpl icu4j = null; // stays null when the ICU4J classes cannot be loaded
+    // The constructor probes for ICU4J once, so the public methods only need a null check.
+    public TextNormalize() {
+        findICU4J();
+    }
+
+    // Sets icu4j to a new ICU4JImpl when both probed classes load; otherwise leaves it null.
+    private void findICU4J() {
+        // Probe for the two ICU4J classes by name before creating the ICU4J-backed helper.
+        try {
+            this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Bidi");
+            this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Normalizer");
+            icu4j = new ICU4JImpl();
+        } catch (ClassNotFoundException e) {
+            icu4j = null; // ICU4J is not available: fall back to pass-through behavior
+        }
+    }
+
+
+    /**
+     * Takes a line of text in presentation order and converts it to logical order.
+     * For most text other than Arabic and Hebrew, the presentation and logical
+     * orders are the same. However, for Arabic and Hebrew, they are different and
+     * if the text involves both RTL and LTR text then the Unicode BIDI algorithm
+     * must be used to determine how to map between them.
+     *
+     * @param a_str Presentation form of line to convert (i.e. the leftmost char is the first char)
+     * @param a_isRtlDominant true if the PAGE has a dominant right to left ordering
+     * @return Logical form of string (or original string if ICU4J library is not on classpath)
+     */
+    public String makeLineLogicalOrder(String a_str, boolean a_isRtlDominant) {
+        if (icu4j != null) {
+            return icu4j.makeLineLogicalOrder(a_str, a_isRtlDominant);
+        }
+        else {
+            return a_str;
+        }
+    }
+
+    /**
+     * Normalize the presentation forms of characters in the string.
+     * For example, convert the single "fi" ligature to "f" and "i".
+     *
+     * @param a_str String to normalize
+     * @return Normalized string (or original string if ICU4J library is not on classpath)
+     */
+    public String normalizePres(String a_str) {
+        if (icu4j != null) {
+            return icu4j.normalizePres(a_str);
+        }
+        else {
+            return a_str;
+        }
+    }
+}

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java Tue Jan 13 07:17:01 2009
@@ -47,7 +47,7 @@
     public void process(PDFOperator operator, List arguments) throws IOException
     {
         COSString string = (COSString)arguments.get( 0 );
-        context.showString( string.getBytes() );
+        context.processEncodedText( string.getBytes() );
     }
 
 }

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java Tue Jan 13 07:17:01 2009
@@ -63,7 +63,7 @@
             }
             else if( next instanceof COSString )
             {
-                context.showString( ((COSString)next).getBytes() );
+                context.processEncodedText( ((COSString)next).getBytes() );
             }
             else
             {

Modified: incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/pdfbox/trunk/test/input/cweb.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/cweb.pdf.txt?rev=734151&r1=734150&r2=734151&view=diff
==============================================================================
Binary files - no diff available.



Re: svn commit: r734151 - in /incubator/pdfbox/trunk: ./ external/ src/main/java/org/apache/pdfbox/examples/util/ src/main/java/org/apache/pdfbox/pdfviewer/ src/main/java/org/apache/pdfbox/util/ src/main/java/org/apache/pdfbox/util/operator/ test/input/

Posted by Jeremias Maerki <de...@jeremias-maerki.ch>.
FYI, this change introduced a Java 1.5 dependency in ICU4JImpl.

Please set up your PDFBox working copies so they are compiled using Java
1.4 in your IDEs. Thanks.

On 13.01.2009 16:17:32 carrier wrote:
> Author: carrier
> Date: Tue Jan 13 07:17:01 2009
> New Revision: 734151
> 
> URL: http://svn.apache.org/viewvc?rev=734151&view=rev
> Log:
> Fix for PDFBOX-377 along with cleanup to make method names more consistent
> 
> Added:
>     incubator/pdfbox/trunk/external/icu4j-4_0.jar   (with props)
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/ICU4JImpl.java   (with props)
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextNormalize.java   (with props)
> Modified:
>     incubator/pdfbox/trunk/README.txt
>     incubator/pdfbox/trunk/build.xml
>     incubator/pdfbox/trunk/pom.xml
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/examples/util/PrintTextLocations.java
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/pdfviewer/PageDrawer.java
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowText.java
>     incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java
>     incubator/pdfbox/trunk/test/input/10101-AR.pdf.txt
>     incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
>     incubator/pdfbox/trunk/test/input/cweb.pdf.txt
> 
<snip/>


Jeremias Maerki