You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/11/17 20:12:00 UTC
svn commit: r1714852 - in /pdfbox/branches/1.8/pdfbox/src:
main/java/org/apache/pdfbox/util/ test/resources/input/
Author: tilman
Date: Tue Nov 17 19:12:00 2015
New Revision: 1714852
URL: http://svn.apache.org/viewvc?rev=1714852&view=rev
Log:
PDFBOX-3110: fix the handling of thread beads, as done in 2.0; add test files provided by Maruan Sahyoun
Added:
pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf
- copied unchanged from r1714846, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf
pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf-sorted.txt
- copied unchanged from r1714846, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf-sorted.txt
pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf.txt
- copied unchanged from r1714846, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads-cropbox.pdf.txt
pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf
- copied unchanged from r1714630, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf
pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf-sorted.txt
- copied unchanged from r1714630, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf-sorted.txt
pdfbox/branches/1.8/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf.txt
- copied unchanged from r1714630, pdfbox/trunk/pdfbox/src/test/resources/input/PDFBOX-3110-poems-beads.pdf.txt
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1714852&r1=1714851&r2=1714852&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Nov 17 19:12:00 2015
@@ -166,7 +166,8 @@ public class PDFTextStripper extends PDF
private float spacingTolerance = .5f;
private float averageCharTolerance = .3f;
- private List<PDThreadBead> pageArticles = null;
+ private List<PDRectangle> beadRectangles = null;
+
/**
* The charactersByArticle is used to extract text by article divisions. For example
* a PDF that has two columns like a newspaper, we want to extract the first column and
@@ -435,11 +436,44 @@ public class PDFTextStripper extends PDF
(endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
{
startPage( page );
- pageArticles = page.getThreadBeads();
- int numberOfArticleSections = 1 + pageArticles.size() * 2;
- if( !shouldSeparateByBeads )
+
+ int numberOfArticleSections = 1;
+ if (shouldSeparateByBeads)
{
- numberOfArticleSections = 1;
+ beadRectangles = new ArrayList<PDRectangle>();
+ for (PDThreadBead bead : page.getThreadBeads())
+ {
+ if (bead == null)
+ {
+ // can't skip, because of null entry handling in processTextPosition()
+ beadRectangles.add(null);
+ continue;
+ }
+
+ PDRectangle rect = bead.getRectangle();
+
+ // bead rectangle is in PDF coordinates (y=0 is bottom),
+ // glyphs are in image coordinates (y=0 is top),
+ // so we must flip
+ PDRectangle mediaBox = page.findMediaBox();
+ float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
+ float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
+ rect.setLowerLeftY(lowerLeftY);
+ rect.setUpperRightY(upperRightY);
+
+ // adjust for cropbox
+ PDRectangle cropBox = page.findCropBox();
+ if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0)
+ {
+ rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
+ rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
+ rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
+ rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
+ }
+
+ beadRectangles.add(rect);
+ }
+ numberOfArticleSections += beadRectangles.size() * 2;
}
int originalSize = charactersByArticle.size();
charactersByArticle.setSize( numberOfArticleSections );
@@ -967,14 +1001,13 @@ public class PDFTextStripper extends PDF
int notFoundButFirstAboveArticleDivisionIndex = -1;
float x = text.getX();
float y = text.getY();
- if( shouldSeparateByBeads )
+ if (shouldSeparateByBeads)
{
- for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
+ for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++)
{
- PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
- if( bead != null )
+ PDRectangle rect = beadRectangles.get(i);
+ if( rect != null )
{
- PDRectangle rect = bead.getRectangle();
if( rect.contains( x, y ) )
{
foundArticleDivisionIndex = i*2+1;