You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/06/20 15:25:25 UTC

svn commit: r956354 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Author: lehmi
Date: Sun Jun 20 13:25:25 2010
New Revision: 956354

URL: http://svn.apache.org/viewvc?rev=956354&view=rev
Log:
PDFBOX-726: PDFTextStripper overrides resetEngine and resets the current page number within that method

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=956354&r1=956353&r2=956354&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sun Jun 20 13:25:25 2010
@@ -35,6 +35,7 @@ import org.apache.pdfbox.exceptions.Inva
 import org.apache.pdfbox.exceptions.WrappedIOException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.common.PDStream;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
@@ -72,7 +73,7 @@ public class PDFTextStripper extends PDF
     private float spacingTolerance = .5f;
     private float averageCharTolerance = .3f;
 
-    private List pageArticles = null;
+    private List<PDThreadBead> pageArticles = null;
     /**
      * The charactersByArticle is used to extract text by article divisions.  For example
      * a PDF that has two columns like a newspaper, we want to extract the first column and
@@ -88,9 +89,9 @@ public class PDFTextStripper extends PDF
      *
      * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
      */
-    protected Vector charactersByArticle = new Vector();
+    protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>();
 
-    private Map characterListMapping = new HashMap();
+    private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
 
     /**
      * The platforms lineseparator.
@@ -206,6 +207,15 @@ public class PDFTextStripper extends PDF
     }
 
     /**
+     * {@inheritDoc}
+     */
+    public void resetEngine()
+    {
+        super.resetEngine();
+        currentPageNo = 0;
+    }
+    
+    /**
      * This will take a PDDocument and write the text of that document to the print writer.
      *
      * @param doc The document to get the data from.
@@ -216,8 +226,6 @@ public class PDFTextStripper extends PDF
     public void writeText( PDDocument doc, Writer outputStream ) throws IOException
     {
         resetEngine();
-
-        currentPageNo = 0;
         document = doc;
         output = outputStream;
         startDocument(document);
@@ -255,7 +263,7 @@ public class PDFTextStripper extends PDF
      *
      * @throws IOException If there is an error parsing the text.
      */
-    protected void processPages( List pages ) throws IOException
+    protected void processPages( List<COSObjectable> pages ) throws IOException
     {
         if( startBookmark != null )
         {
@@ -279,7 +287,7 @@ public class PDFTextStripper extends PDF
         }
 
 
-        Iterator pageIter = pages.iterator();
+        Iterator<COSObjectable> pageIter = pages.iterator();
         while( pageIter.hasNext() )
         {
             PDPage nextPage = (PDPage)pageIter.next();
@@ -293,7 +301,7 @@ public class PDFTextStripper extends PDF
         }
     }
 
-    private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException
+    private int getPageNumber( PDOutlineItem bookmark, List<COSObjectable> allPages ) throws IOException
     {
         int pageNumber = -1;
         PDPage page = bookmark.findDestinationPage( document );
@@ -355,11 +363,11 @@ public class PDFTextStripper extends PDF
             {
                 if( numberOfArticleSections < originalSize )
                 {
-                    ((List)charactersByArticle.get( i )).clear();
+                    ((List<TextPosition>)charactersByArticle.get( i )).clear();
                 }
                 else
                 {
-                    charactersByArticle.set( i, new ArrayList() );
+                    charactersByArticle.set( i, new ArrayList<TextPosition>() );
                 }
             }
 
@@ -463,14 +471,14 @@ public class PDFTextStripper extends PDF
 
         for( int i = 0; i < charactersByArticle.size(); i++)
         {
-            List textList = (List)charactersByArticle.get( i );
+            List<TextPosition> textList = (List<TextPosition>)charactersByArticle.get( i );
             if( sortByPosition )
             {
                 TextPositionComparator comparator = new TextPositionComparator();
                 Collections.sort( textList, comparator );
             }
 
-            Iterator textIter = textList.iterator();
+            Iterator<TextPosition> textIter = textList.iterator();
 
             /* Before we can display the text, we need to do some normalizing.
              * Arabic and Hebrew text is right to left and is typically stored
@@ -492,7 +500,7 @@ public class PDFTextStripper extends PDF
 
             while( textIter.hasNext() )
             {
-                TextPosition position = (TextPosition)textIter.next();
+                TextPosition position = textIter.next();
                 String stringValue = position.getCharacter();
 
                 for (int a = 0; a < stringValue.length(); a++) 
@@ -549,7 +557,7 @@ public class PDFTextStripper extends PDF
             float previousAveCharWidth = -1;
             while( textIter.hasNext() )
             {
-                TextPosition position = (TextPosition)textIter.next();
+                TextPosition position = textIter.next();
                 String characterValue = position.getCharacter();
 
                 //Resets the average character width when we see a change in font 
@@ -816,10 +824,10 @@ public class PDFTextStripper extends PDF
             String textCharacter = text.getCharacter();
             float textX = text.getX();
             float textY = text.getY();
-            List sameTextCharacters = (List)characterListMapping.get( textCharacter );
+            List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
             if( sameTextCharacters == null )
             {
-                sameTextCharacters = new ArrayList();
+                sameTextCharacters = new ArrayList<TextPosition>();
                 characterListMapping.put( textCharacter, sameTextCharacters );
             }
 
@@ -838,7 +846,7 @@ public class PDFTextStripper extends PDF
             float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
             for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
             {
-                TextPosition character = (TextPosition)sameTextCharacters.get( i );
+                TextPosition character = sameTextCharacters.get( i );
                 String charCharacter = character.getCharacter();
                 float charX = character.getX();
                 float charY = character.getY();
@@ -932,7 +940,7 @@ public class PDFTextStripper extends PDF
                 articleDivisionIndex = charactersByArticle.size()-1;
             }
 
-            List textList = (List) charactersByArticle.get( articleDivisionIndex );
+            List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get( articleDivisionIndex );
 
             /* In the wild, some PDF encoded documents put diacritics (accents on
              * top of characters) into a separate Tj element.  When displaying them
@@ -1119,7 +1127,7 @@ public class PDFTextStripper extends PDF
      *
      * @return A double List of TextPositions for all text strings on the page.
      */
-    protected List getCharactersByArticle()
+    protected Vector<List<TextPosition>> getCharactersByArticle()
     {
         return charactersByArticle;
     }