You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/05/10 16:25:38 UTC
svn commit: r1678561 - in /pdfbox/branches/1.8/pdfbox/src:
main/java/org/apache/pdfbox/util/PDFTextStripper.java
test/java/org/apache/pdfbox/util/TestTextStripper.java
test/resources/org/apache/pdfbox/pdmodel/with_outline.pdf
Author: tilman
Date: Sun May 10 14:25:38 2015
New Revision: 1678561
URL: http://svn.apache.org/r1678561
Log:
PDFBOX-2792: fix regressions in text stripping where page intervals are controlled by bookmarks
Added:
pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdmodel/with_outline.pdf (with props)
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1678561&r1=1678560&r2=1678561&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sun May 10 14:25:38 2015
@@ -316,8 +316,6 @@ public class PDFTextStripper extends PDF
{
characterListMapping.clear();
}
- startBookmark = null;
- endBookmark = null;
}
/**
Modified: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=1678561&r1=1678560&r2=1678561&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Sun May 10 14:25:38 2015
@@ -26,6 +26,7 @@ import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.util.Iterator;
import junit.framework.Test;
import junit.framework.TestCase;
@@ -34,6 +35,10 @@ import junit.framework.TestSuite;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.TestPDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
/**
@@ -392,6 +397,89 @@ public class TestTextStripper extends Te
}
}
+
+ /**
+  * Returns the page number an outline item points to, after cross-checking it against the
+  * position of the item's destination page in the document's page list.
+  *
+  * @param doc the document that contains the outline item.
+  * @param oi the outline item; its destination is cast to a PDPageDestination.
+  * @return the page number reported by the item's destination (0-based, since it is asserted
+  * to equal a list index).
+  * @throws IOException if one of the outline or destination lookups fails.
+  */
+ private int findOutlineItemDestPageNum(PDDocument doc, PDOutlineItem oi) throws IOException
+ {
+     PDPageDestination destination = (PDPageDestination) oi.getDestination();
+
+     // route 1: position of the resolved destination page within the document's page list
+     int listIndex = doc.getDocumentCatalog().getAllPages().indexOf(oi.findDestinationPage(doc));
+     // route 2: page number reported by the destination itself
+     int destPageNumber = destination.retrieveDestPageNumber();
+
+     // both routes must agree, otherwise the outline and the page tree are inconsistent
+     assertEquals(listIndex, destPageNumber);
+     return destPageNumber;
+ }
+
+ /**
+  * Test whether text stripping controlled by outline items (bookmarks) works properly. The
+  * test file has 4 outline items at the top level, that point to 0-based pages 0, 2, 3 and 4.
+  * Text is stripped four ways: by the bookmarks for 0-based pages 2 to 3, by the equivalent
+  * 1-based page interval 3 to 4, by the bookmark for 0-based page 2 alone, and by the
+  * equivalent 1-based page interval 3 to 3. Every result must be non-empty and different from
+  * the text of the complete document, the two-page and one-page results must differ from each
+  * other, and each bookmark-driven result must be identical to its page-interval counterpart.
+  *
+  * @throws IOException if the test document cannot be loaded, stripped or closed.
+  */
+ public void testStripByOutlineItems() throws IOException
+ {
+     // the resource name is relative, so it is resolved in the package of TestPDDocumentCatalog
+     PDDocument doc = PDDocument.load(TestPDDocumentCatalog.class.getResourceAsStream("with_outline.pdf"));
+     try
+     {
+         PDDocumentOutline outline = doc.getDocumentCatalog().getDocumentOutline();
+         PDOutlineItem oi0 = outline.getFirstChild();
+         PDOutlineItem oi2 = oi0.getNextSibling();
+         PDOutlineItem oi3 = oi2.getNextSibling();
+         PDOutlineItem oi4 = oi3.getNextSibling();
+
+         // sanity check of the test file: the variable names carry the expected 0-based page
+         assertEquals(0, findOutlineItemDestPageNum(doc, oi0));
+         assertEquals(2, findOutlineItemDestPageNum(doc, oi2));
+         assertEquals(3, findOutlineItemDestPageNum(doc, oi3));
+         assertEquals(4, findOutlineItemDestPageNum(doc, oi4));
+
+         // reference: text of the complete document
+         String textFull = stripper.getText(doc);
+         assertFalse(textFull.isEmpty());
+
+         // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4
+         // by their bookmarks
+         stripper.setStartBookmark(oi2);
+         stripper.setEndBookmark(oi3);
+         String textoi23 = stripper.getText(doc);
+         assertFalse(textoi23.isEmpty());
+         assertFalse(textoi23.equals(textFull));
+
+         // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4
+         // by their page numbers
+         stripper.setStartBookmark(null);
+         stripper.setEndBookmark(null);
+         stripper.setStartPage(3);
+         stripper.setEndPage(4);
+         String textp34 = stripper.getText(doc);
+         assertFalse(textp34.isEmpty());
+         // compare the text that was just stripped, not the earlier bookmark result
+         assertFalse(textp34.equals(textFull));
+         assertEquals(textoi23, textp34);
+
+         // this should grab 0-based page 2, i.e. 1-based page 3
+         // by the bookmark
+         stripper.setStartBookmark(oi2);
+         stripper.setEndBookmark(oi2);
+         String textoi2 = stripper.getText(doc);
+         assertFalse(textoi2.isEmpty());
+         assertFalse(textoi2.equals(textoi23));
+         assertFalse(textoi2.equals(textFull));
+
+         // this should grab 0-based page 2, i.e. 1-based page 3
+         // by the page number
+         stripper.setStartBookmark(null);
+         stripper.setEndBookmark(null);
+         stripper.setStartPage(3);
+         stripper.setEndPage(3);
+         String textp3 = stripper.getText(doc);
+         assertFalse(textp3.isEmpty());
+         assertFalse(textp3.equals(textp34));
+         assertFalse(textp3.equals(textFull));
+         assertEquals(textoi2, textp3);
+     }
+     finally
+     {
+         // release the document even when one of the assertions above fails
+         doc.close();
+     }
+ }
+
+
/**
* Set the tests in the suite for this test class.
*
Added: pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdmodel/with_outline.pdf
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdmodel/with_outline.pdf?rev=1678561&view=auto
==============================================================================
Binary file - no diff available.
Propchange: pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdmodel/with_outline.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf