You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/05/10 16:04:53 UTC
svn commit: r1678560 - in /pdfbox/trunk/pdfbox/src:
main/java/org/apache/pdfbox/text/PDFTextStripper.java
test/java/org/apache/pdfbox/text/TestTextStripper.java
Author: tilman
Date: Sun May 10 14:04:52 2015
New Revision: 1678560
URL: http://svn.apache.org/r1678560
Log:
PDFBOX-2792: fix regressions in text stripping where page intervals are controlled by bookmarks
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java?rev=1678560&r1=1678559&r2=1678560&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java Sun May 10 14:04:52 2015
@@ -139,9 +139,12 @@ public class PDFTextStripper extends PDF
private int startPage = 1;
private int endPage = Integer.MAX_VALUE;
private PDOutlineItem startBookmark = null;
+
+ // 1-based bookmark pages
private int startBookmarkPageNumber = -1;
- private PDOutlineItem endBookmark = null;
private int endBookmarkPageNumber = -1;
+
+ private PDOutlineItem endBookmark = null;
private boolean suppressDuplicateOverlappingText = true;
private boolean shouldSeparateByBeads = true;
private boolean sortByPosition = false;
@@ -220,8 +223,6 @@ public class PDFTextStripper extends PDF
{
characterListMapping.clear();
}
- startBookmark = null;
- endBookmark = null;
}
/**
@@ -258,13 +259,34 @@ public class PDFTextStripper extends PDF
*/
protected void processPages(PDPageTree pages) throws IOException
{
- PDPage startPage = startBookmark == null ? null :
- startBookmark.findDestinationPage(document);
+ PDPageTree pagesTree = document.getPages();
+
+ PDPage startBookmarkPage = startBookmark == null ? null
+ : startBookmark.findDestinationPage(document);
+ if (startBookmarkPage != null)
+ {
+ startBookmarkPageNumber = pagesTree.indexOf(startBookmarkPage) + 1;
+ }
+ else
+ {
+ // -1 = undefined
+ startBookmarkPageNumber = -1;
+ }
- PDPage endPage = endBookmark == null ? null :
- endBookmark.findDestinationPage(document);
+ PDPage endBookmarkPage = endBookmark == null ? null
+ : endBookmark.findDestinationPage(document);
+ if (endBookmarkPage != null)
+ {
+ endBookmarkPageNumber = pagesTree.indexOf(endBookmarkPage) + 1;
+ }
+ else
+ {
+ // -1 = undefined
+ endBookmarkPageNumber = -1;
+ }
- if (startPage != null && endPage != null &&
+ if (startBookmarkPageNumber == -1 && startBookmark != null &&
+ endBookmarkPageNumber == -1 && endBookmark != null &&
startBookmark.getCOSObject() == endBookmark.getCOSObject())
{
// this is a special case where both the start and end bookmark
@@ -954,7 +976,7 @@ public class PDFTextStripper extends PDF
/**
* This will set the first page to be extracted by this class.
*
- * @param startPageValue New value of property startPage.
+ * @param startPageValue New value of 1-based startPage property.
*/
public void setStartPage(int startPageValue)
{
@@ -977,7 +999,7 @@ public class PDFTextStripper extends PDF
/**
* This will set the last page to be extracted by this class.
*
- * @param endPageValue New value of property endPage.
+ * @param endPageValue New value of 1-based endPage property.
*/
public void setEndPage(int endPageValue)
{
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java?rev=1678560&r1=1678559&r2=1678560&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java Sun May 10 14:04:52 2015
@@ -26,6 +26,7 @@ import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.util.Iterator;
import junit.framework.Test;
import junit.framework.TestCase;
@@ -34,6 +35,10 @@ import junit.framework.TestSuite;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.TestPDPageTree;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
/**
@@ -331,6 +336,89 @@ public class TestTextStripper extends Te
document.close();
}
}
+
+ private int findOutlineItemDestPageNum(PDDocument doc, PDOutlineItem oi) throws IOException
+ {
+ PDPageDestination pageDest = (PDPageDestination) oi.getDestination();
+
+ // two methods to get the page index, the result should be identical!
+ int indexOfPage = doc.getPages().indexOf(oi.findDestinationPage(doc));
+ int pageNum = pageDest.retrievePageNumber();
+ assertEquals(indexOfPage, pageNum);
+
+ return pageNum;
+ }
+
+ /**
+ * Test whether stripping controlled by outline items works properly. The test file has 4
+ * outline items at the top level, that point to 0-based pages 0, 2, 3 and 4. We are testing
+ * text stripping by outlines pointing to 0-based pages 2 and 3, and also text stripping of the
+ * 0-based page 2. The test makes sure that the output is different to a complete strip, not
+ * empty, different to each other when different bookmark intervals are used, but identical from
+ * bookmark intervals to strips with page intervals.
+ *
+ * @throws IOException
+ */
+ public void testStripByOutlineItems() throws IOException
+ {
+ PDDocument doc = PDDocument.load(TestPDPageTree.class.getResourceAsStream("with_outline.pdf"));
+ PDDocumentOutline outline = doc.getDocumentCatalog().getDocumentOutline();
+ Iterable<PDOutlineItem> children = outline.children();
+ Iterator<PDOutlineItem> it = children.iterator();
+ PDOutlineItem oi0 = it.next();
+ PDOutlineItem oi2 = it.next();
+ PDOutlineItem oi3 = it.next();
+ PDOutlineItem oi4 = it.next();
+
+ assertEquals(0, findOutlineItemDestPageNum(doc, oi0));
+ assertEquals(2, findOutlineItemDestPageNum(doc, oi2));
+ assertEquals(3, findOutlineItemDestPageNum(doc, oi3));
+ assertEquals(4, findOutlineItemDestPageNum(doc, oi4));
+
+ String textFull = stripper.getText(doc);
+ assertFalse(textFull.isEmpty());
+
+ // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4
+ // by their bookmarks
+ stripper.setStartBookmark(oi2);
+ stripper.setEndBookmark(oi3);
+ String textoi23 = stripper.getText(doc);
+ assertFalse(textoi23.isEmpty());
+ assertFalse(textoi23.equals(textFull));
+
+ // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4
+ // by their page numbers
+ stripper.setStartBookmark(null);
+ stripper.setEndBookmark(null);
+ stripper.setStartPage(3);
+ stripper.setEndPage(4);
+ String textp34 = stripper.getText(doc);
+ assertFalse(textp34.isEmpty());
+ assertFalse(textoi23.equals(textFull));
+ assertTrue(textoi23.equals(textp34));
+
+
+ // this should grab 0-based page 2, i.e. 1-based page 3
+ // by the bookmark
+ stripper.setStartBookmark(oi2);
+ stripper.setEndBookmark(oi2);
+ String textoi2 = stripper.getText(doc);
+ assertFalse(textoi2.isEmpty());
+ assertFalse(textoi2.equals(textoi23));
+ assertFalse(textoi23.equals(textFull));
+
+ // this should grab 0-based page 2, i.e. 1-based page 3
+ // by the page number
+ stripper.setStartBookmark(null);
+ stripper.setEndBookmark(null);
+ stripper.setStartPage(3);
+ stripper.setEndPage(3);
+ String textp3 = stripper.getText(doc);
+ assertFalse(textp3.isEmpty());
+ assertFalse(textp3.equals(textp34));
+ assertFalse(textoi23.equals(textFull));
+ assertTrue(textoi2.equals(textp3));
+ }
/**
* Process each file in the specified directory.
@@ -361,8 +449,7 @@ public class TestTextStripper extends Te
*
* @throws Exception when there is an exception
*/
- public void testExtract()
- throws Exception
+ public void testExtract() throws Exception
{
String filename = System.getProperty("org.apache.pdfbox.util.TextStripper.file");
File inDir = new File("src/test/resources/input");