You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/11/17 19:23:03 UTC
svn commit: r1714845 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Author: tilman
Date: Tue Nov 17 18:23:03 2015
New Revision: 1714845
URL: http://svn.apache.org/viewvc?rev=1714845&view=rev
Log:
PDFBOX-3110: optimize bead rectangle handling
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java?rev=1714845&r1=1714844&r2=1714845&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java Tue Nov 17 18:23:03 2015
@@ -165,7 +165,7 @@ public class PDFTextStripper extends PDF
private float spacingTolerance = .5f;
private float averageCharTolerance = .3f;
- private List<PDThreadBead> pageArticles = null;
+ private List<PDRectangle> beadRectangles = null;
/**
* The charactersByArticle is used to extract text by article divisions. For example a PDF that has two columns like
@@ -347,11 +347,44 @@ public class PDFTextStripper extends PDF
&& (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
{
startPage(page);
- pageArticles = page.getThreadBeads();
- int numberOfArticleSections = 1 + pageArticles.size() * 2;
- if (!shouldSeparateByBeads)
+
+ int numberOfArticleSections = 1;
+ if (shouldSeparateByBeads)
{
- numberOfArticleSections = 1;
+ beadRectangles = new ArrayList<PDRectangle>();
+ for (PDThreadBead bead : page.getThreadBeads())
+ {
+ if (bead == null)
+ {
+ // can't skip, because of null entry handling in processTextPosition()
+ beadRectangles.add(null);
+ continue;
+ }
+
+ PDRectangle rect = bead.getRectangle();
+
+ // bead rectangle is in PDF coordinates (y=0 is bottom),
+ // glyphs are in image coordinates (y=0 is top),
+ // so we must flip
+ PDRectangle mediaBox = page.getMediaBox();
+ float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
+ float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
+ rect.setLowerLeftY(lowerLeftY);
+ rect.setUpperRightY(upperRightY);
+
+ // adjust for cropbox
+ PDRectangle cropBox = page.getCropBox();
+ if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0)
+ {
+ rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
+ rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
+ rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
+ rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
+ }
+
+ beadRectangles.add(rect);
+ }
+ numberOfArticleSections += beadRectangles.size() * 2;
}
int originalSize = charactersByArticle.size();
charactersByArticle.setSize(numberOfArticleSections);
@@ -814,33 +847,11 @@ public class PDFTextStripper extends PDF
float y = text.getY();
if (shouldSeparateByBeads)
{
- for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++)
+ for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++)
{
- PDThreadBead bead = pageArticles.get(i);
- if (bead != null)
+ PDRectangle rect = beadRectangles.get(i);
+ if (rect != null)
{
- PDRectangle rect = bead.getRectangle();
-
- // bead rectangle is in PDF coordinates (y=0 is bottom),
- // glyphs are in image coordinates (y=0 is top),
- // so we must flip
- PDPage pdPage = getCurrentPage();
- PDRectangle mediaBox = pdPage.getMediaBox();
- float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
- float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
- rect.setLowerLeftY(lowerLeftY);
- rect.setUpperRightY(upperRightY);
-
- // adjust for cropbox
- PDRectangle cropBox = pdPage.getCropBox();
- if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0)
- {
- rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
- rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
- rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
- rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
- }
-
if (rect.contains(x, y))
{
foundArticleDivisionIndex = i * 2 + 1;