You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by je...@apache.org on 2009/01/04 16:48:48 UTC

svn commit: r731278 - /incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Author: jeremias
Date: Sun Jan  4 07:48:47 2009
New Revision: 731278

URL: http://svn.apache.org/viewvc?rev=731278&view=rev
Log:
Restored Java 1.4 compatibility.

Modified:
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=731278&r1=731277&r2=731278&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sun Jan  4 07:48:47 2009
@@ -19,7 +19,6 @@
 import java.io.IOException;
 import java.io.StringWriter;
 import java.io.Writer;
-
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -31,20 +30,16 @@
 
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSStream;
-
+import org.apache.pdfbox.exceptions.CryptographyException;
+import org.apache.pdfbox.exceptions.InvalidPasswordException;
+import org.apache.pdfbox.exceptions.WrappedIOException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
-
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.common.PDStream;
-
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
 
-import org.apache.pdfbox.exceptions.CryptographyException;
-import org.apache.pdfbox.exceptions.InvalidPasswordException;
-import org.apache.pdfbox.exceptions.WrappedIOException;
-
 
 /**
  * This class will take a pdf document and strip out all of the text and ignore the
@@ -383,7 +378,7 @@
      * @throws IOException If there is an error writing the text.
      */
     protected void flushText() throws IOException
-    { 
+    {
         float maxYForLine = -1;
         float minYTopForLine = Float.MAX_VALUE;
         //float lastBaselineFontSize = -1;
@@ -394,28 +389,28 @@
         float maxHeightForLine = -1;
         //float lastHeightForLine = -1;
         TextPosition lastPosition = null;
-        for( int i=0; i<charactersByArticle.size(); i++)
+        for( int i = 0; i < charactersByArticle.size(); i++)
         {
             startParagraph();
-            List<TextPosition> textList = (List<TextPosition>)charactersByArticle.get( i );
+            List textList = (List)charactersByArticle.get( i );
             if( sortByPosition )
             {
                 TextPositionComparator comparator = new TextPositionComparator();
                 Collections.sort( textList, comparator );
             }
-            
-            Iterator<TextPosition> textIter = textList.iterator();
+
+            Iterator textIter = textList.iterator();
             while( textIter.hasNext() )
             {
-                TextPosition position = textIter.next();
+                TextPosition position = (TextPosition)textIter.next();
                 String characterValue = position.getCharacter();
-                
+
                 float positionX;
                 float positionY;
                 float positionWidth;
                 float positionHeight;
-                
-                /* If we are sorting, then we need to use the text direction 
+
+                /* If we are sorting, then we need to use the text direction
                  * adjusted coordinates, because they were used in the sorting. */
                 if (sortByPosition) {
                 	positionX = position.getXDirAdj();
@@ -429,8 +424,8 @@
                 	positionWidth = position.getWidth();
                 	positionHeight = position.getHeight();
                 }
-                
-                
+
+
                 float wordSpacing = 0;
                 /* float wordSpacing = position.getWordSpacing();	BC: When I re-enabled this for a a test, lots of extra spaces were added
                 if( wordSpacing == 0 )
@@ -445,8 +440,8 @@
                       wordSpacing = positionWidth;
                     }
                 //}
-                
-                
+
+
                 // RDD - We add a conservative approximation for space determination.
                 // basically if there is a blank area between two characters that is
                 //equal to some percentage of the word spacing then that will be the
@@ -459,7 +454,7 @@
                 {
                     expectedStartOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
                 }
-    
+
                 // RDD - We will suppress text that is very close to the current line
                 // and which overwrites previously rendered text on this line.
                 // This is done specifically to handle a reasonably common situation
@@ -480,31 +475,31 @@
                     }
                     continue;
                 }*/
-    
+
                 // RDD - Here we determine whether this text object is on the current
                 // line.  We use the lastBaselineFontSize to handle the superscript
                 // case, and the size of the current font to handle the subscript case.
                 // Text must overlap with the last rendered baseline text by at least
                 // a small amount in order to be considered as being on the same line.
                 //
-                
+
                 //int verticalScaling = 1;
                 //if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
                 //{
                 //    verticalScaling = -1;
                 //}
-                
+
                 if( lastPosition != null )
                 {
                     //if (currentY != -1 &&
                     //    ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
                     //     (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
                     //{
-                    /* XXX BC: In theory, this check should really check if the next char is in full range 
+                    /* XXX BC: In theory, this check should really check if the next char is in full range
                      * seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot
                      * of regression test failures.  So, I'm leaving it be for now. */
-                    if( ( !overlap( positionY, positionHeight, maxYForLine, maxHeightForLine ) )) 
-                    		//maxYForLine - minYTopForLine))) 
+                    if( ( !overlap( positionY, positionHeight, maxYForLine, maxHeightForLine ) ))
+                    		//maxYForLine - minYTopForLine)))
                     {
                         processLineSeparator( position );
                         endOfLastTextX = -1;
@@ -515,8 +510,8 @@
                         minYTopForLine = Float.MAX_VALUE;
                         //lastHeightForLine = -1;
                     }
-                
-    
+
+
 	                if (expectedStartOfNextWordX != -1 && expectedStartOfNextWordX < positionX &&
 	                   //only bother adding a space if the last character was not a space
 	                   lastPosition.getCharacter() != null &&
@@ -529,17 +524,17 @@
 	                    //System.out.println( "Not a word separator " + position.getCharacter() +  " start=" + startOfNextWordX + " x=" + position.getX() );
 	                }
                 }
-    
+
                 if (positionY >= maxYForLine) {
                 	maxYForLine = positionY;
                     //lastBaselineFontSize = position.getFontSize();
                 }
-    
+
                 // RDD - endX is what PDF considers to be the x coordinate of the
                 // end position of the text.  We use it in computing our metrics below.
                 endOfLastTextX = positionX + positionWidth;
                 //endOfLastTextY = positionY;
-    
+
                 if (characterValue != null)
                 {
                     writeCharacters( position );
@@ -549,14 +544,14 @@
                     //Position.getString() is null so not writing anything
                 }
                 maxHeightForLine = Math.max( maxHeightForLine, positionHeight );
-                minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); 
+                minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
                 lastPosition = position;
                 //lastHeightForLine = position.getHeight();
                 lastWordSpacing = wordSpacing;
             }
             endParagraph();
         }
-        
+
 
         // RDD - newline at end of flush - required for end of page (so that the top
         // of the next page starts on its own line.
@@ -651,7 +646,7 @@
                 if( charCharacter != null &&
                     //charCharacter.equals( textCharacter ) &&
                     within( charX, textX, tolerance ) &&
-                    within( charY, 
+                    within( charY,
                     		textY,
                             tolerance ) )
                 {