You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/11/17 19:23:03 UTC
svn commit: r1714845 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java

Author: tilman
Date: Tue Nov 17 18:23:03 2015
New Revision: 1714845

URL: http://svn.apache.org/viewvc?rev=1714845&view=rev
Log:
PDFBOX-3110: optimize bead rectangle handling

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java?rev=1714845&r1=1714844&r2=1714845&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java Tue Nov 17 18:23:03 2015
@@ -165,7 +165,7 @@ public class PDFTextStripper extends PDF
     private float spacingTolerance = .5f;
     private float averageCharTolerance = .3f;
 
-    private List<PDThreadBead> pageArticles = null;
+    private List<PDRectangle> beadRectangles = null;
 
     /**
      * The charactersByArticle is used to extract text by article divisions. For example a PDF that has two columns like
@@ -347,11 +347,44 @@ public class PDFTextStripper extends PDF
                 && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
         {
             startPage(page);
-            pageArticles = page.getThreadBeads();
-            int numberOfArticleSections = 1 + pageArticles.size() * 2;
-            if (!shouldSeparateByBeads)
+
+            int numberOfArticleSections = 1;
+            if (shouldSeparateByBeads)
             {
-                numberOfArticleSections = 1;
+                beadRectangles = new ArrayList<PDRectangle>();
+                for (PDThreadBead bead : page.getThreadBeads())
+                {
+                    if (bead == null)
+                    {
+                        // can't skip, because of null entry handling in processTextPosition()
+                        beadRectangles.add(null);
+                        continue;
+                    }
+                        
+                    PDRectangle rect = bead.getRectangle();
+                    
+                    // bead rectangle is in PDF coordinates (y=0 is bottom), 
+                    // glyphs are in image coordinates (y=0 is top),
+                    // so we must flip
+                    PDRectangle mediaBox = page.getMediaBox();
+                    float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
+                    float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
+                    rect.setLowerLeftY(lowerLeftY);
+                    rect.setUpperRightY(upperRightY);
+
+                    // adjust for cropbox
+                    PDRectangle cropBox = page.getCropBox();
+                    if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0)
+                    {
+                        rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
+                        rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
+                        rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
+                        rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
+                    }
+                    
+                    beadRectangles.add(rect);
+                }
+                numberOfArticleSections += beadRectangles.size() * 2;
             }
             int originalSize = charactersByArticle.size();
             charactersByArticle.setSize(numberOfArticleSections);
@@ -814,33 +847,11 @@ public class PDFTextStripper extends PDF
             float y = text.getY();
             if (shouldSeparateByBeads)
             {
-                for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++)
+                for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++)
                 {
-                    PDThreadBead bead = pageArticles.get(i);
-                    if (bead != null)
+                    PDRectangle rect = beadRectangles.get(i);
+                    if (rect != null)
                     {
-                        PDRectangle rect = bead.getRectangle();
-                        
-                        // bead rectangle is in PDF coordinates (y=0 is bottom), 
-                        // glyphs are in image coordinates (y=0 is top),
-                        // so we must flip
-                        PDPage pdPage = getCurrentPage();
-                        PDRectangle mediaBox = pdPage.getMediaBox();
-                        float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
-                        float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
-                        rect.setLowerLeftY(lowerLeftY);
-                        rect.setUpperRightY(upperRightY);
-                        
-                        // adjust for cropbox
-                        PDRectangle cropBox = pdPage.getCropBox();
-                        if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0)
-                        {
-                            rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
-                            rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
-                            rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
-                            rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
-                        }
-                        
                         if (rect.contains(x, y))
                         {
                             foundArticleDivisionIndex = i * 2 + 1;