You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/11/17 20:12:00 UTC

svn commit: r1714852 - in /pdfbox/branches/1.8/pdfbox/src: main/java/org/apache/pdfbox/util/ test/resources/input/

Author: tilman
Date: Tue Nov 17 19:12:00 2015
New Revision: 1714852

URL: http://svn.apache.org/viewvc?rev=1714852&view=rev
Log:
PDFBOX-3110: fix handling of beads, as done in 2.0; add test files by Maruan Sahyoun

Added:
    pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf
      - copied unchanged from r1714846, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf
    pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf-sorted.txt
      - copied unchanged from r1714846, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf-sorted.txt
    pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf.txt
      - copied unchanged from r1714846, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf.txt
    pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf
      - copied unchanged from r1714630, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf
    pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf-sorted.txt
      - copied unchanged from r1714630, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf-sorted.txt
    pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf.txt
      - copied unchanged from r1714630, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf.txt
Modified:
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1714852&r1=1714851&r2=1714852&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Nov 17 19:12:00 2015
@@ -166,7 +166,8 @@ public class PDFTextStripper extends PDF
     private float spacingTolerance = .5f;
     private float averageCharTolerance = .3f;
 
-    private List<PDThreadBead> pageArticles = null;
+    private List<PDRectangle> beadRectangles = null;
+    
     /**
      * The charactersByArticle is used to extract text by article divisions.  For example
      * a PDF that has two columns like a newspaper, we want to extract the first column and
@@ -435,11 +436,44 @@ public class PDFTextStripper extends PDF
                 (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
         {
             startPage( page );
-            pageArticles = page.getThreadBeads();
-            int numberOfArticleSections = 1 + pageArticles.size() * 2;
-            if( !shouldSeparateByBeads )
+
+            int numberOfArticleSections = 1;
+            if (shouldSeparateByBeads)
             {
-                numberOfArticleSections = 1;
+                beadRectangles = new ArrayList<PDRectangle>();
+                for (PDThreadBead bead : page.getThreadBeads())
+                {
+                    if (bead == null)
+                    {
+                        // can't skip, because of null entry handling in processTextPosition()
+                        beadRectangles.add(null);
+                        continue;
+                    }
+
+                    PDRectangle rect = bead.getRectangle();
+
+                    // bead rectangle is in PDF coordinates (y=0 is bottom), 
+                    // glyphs are in image coordinates (y=0 is top),
+                    // so we must flip
+                    PDRectangle mediaBox = page.findMediaBox();
+                    float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
+                    float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
+                    rect.setLowerLeftY(lowerLeftY);
+                    rect.setUpperRightY(upperRightY);
+
+                    // adjust for cropbox
+                    PDRectangle cropBox = page.findCropBox();
+                    if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0)
+                    {
+                        rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
+                        rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
+                        rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
+                        rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
+                    }
+
+                    beadRectangles.add(rect);
+                }
+                numberOfArticleSections += beadRectangles.size() * 2;
             }
             int originalSize = charactersByArticle.size();
             charactersByArticle.setSize( numberOfArticleSections );
@@ -967,14 +1001,13 @@ public class PDFTextStripper extends PDF
             int notFoundButFirstAboveArticleDivisionIndex = -1;
             float x = text.getX();
             float y = text.getY();
-            if( shouldSeparateByBeads )
+            if (shouldSeparateByBeads)
             {
-                for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
+                for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++)
                 {
-                    PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
-                    if( bead != null )
+                    PDRectangle rect = beadRectangles.get(i);
+                    if( rect != null )
                     {
-                        PDRectangle rect = bead.getRectangle();
                         if( rect.contains( x, y ) )
                         {
                             foundArticleDivisionIndex = i*2+1;