You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2015/05/10 16:04:53 UTC

svn commit: r1678560 - in /pdfbox/trunk/pdfbox/src: main/java/org/apache/pdfbox/text/PDFTextStripper.java test/java/org/apache/pdfbox/text/TestTextStripper.java

Author: tilman
Date: Sun May 10 14:04:52 2015
New Revision: 1678560

URL: http://svn.apache.org/r1678560
Log:
PDFBOX-2792: fix regressions in text stripping where page intervals are controlled by bookmarks

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java?rev=1678560&r1=1678559&r2=1678560&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java Sun May 10 14:04:52 2015
@@ -139,9 +139,12 @@ public class PDFTextStripper extends PDF
     private int startPage = 1;
     private int endPage = Integer.MAX_VALUE;
     private PDOutlineItem startBookmark = null;
+    
+    // 1-based page numbers of the start/end bookmark destinations; -1 means undefined
     private int startBookmarkPageNumber = -1;
-    private PDOutlineItem endBookmark = null;
     private int endBookmarkPageNumber = -1;
+    
+    private PDOutlineItem endBookmark = null;
     private boolean suppressDuplicateOverlappingText = true;
     private boolean shouldSeparateByBeads = true;
     private boolean sortByPosition = false;
@@ -220,8 +223,6 @@ public class PDFTextStripper extends PDF
         {
             characterListMapping.clear();
         }
-        startBookmark = null;
-        endBookmark = null;
     }
     
     /**
@@ -258,13 +259,34 @@ public class PDFTextStripper extends PDF
      */
     protected void processPages(PDPageTree pages) throws IOException
     {
-        PDPage startPage = startBookmark == null ? null :
-                startBookmark.findDestinationPage(document);
+        PDPageTree pagesTree = document.getPages();
+        
+        PDPage startBookmarkPage = startBookmark == null ? null
+                           : startBookmark.findDestinationPage(document);
+        if (startBookmarkPage != null)
+        {
+            startBookmarkPageNumber = pagesTree.indexOf(startBookmarkPage) + 1;
+        }
+        else
+        {
+            // -1 = undefined
+            startBookmarkPageNumber = -1;
+        }
 
-        PDPage endPage = endBookmark == null ? null :
-                endBookmark.findDestinationPage(document);
+        PDPage endBookmarkPage = endBookmark == null ? null
+                         : endBookmark.findDestinationPage(document);
+        if (endBookmarkPage != null)
+        {
+            endBookmarkPageNumber = pagesTree.indexOf(endBookmarkPage) + 1;
+        }
+        else
+        {
+            // -1 = undefined
+            endBookmarkPageNumber = -1;
+        }
 
-        if (startPage != null && endPage != null &&
+        if (startBookmarkPageNumber == -1 && startBookmark != null &&
+            endBookmarkPageNumber == -1 && endBookmark != null &&
             startBookmark.getCOSObject() == endBookmark.getCOSObject())
         {
             // this is a special case where both the start and end bookmark
@@ -954,7 +976,7 @@ public class PDFTextStripper extends PDF
     /**
      * This will set the first page to be extracted by this class.
      *
-     * @param startPageValue New value of property startPage.
+     * @param startPageValue New value of 1-based startPage property.
      */
     public void setStartPage(int startPageValue)
     {
@@ -977,7 +999,7 @@ public class PDFTextStripper extends PDF
     /**
      * This will set the last page to be extracted by this class.
      *
-     * @param endPageValue New value of property endPage.
+     * @param endPageValue New value of 1-based endPage property.
      */
     public void setEndPage(int endPageValue)
     {

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java?rev=1678560&r1=1678559&r2=1678560&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java Sun May 10 14:04:52 2015
@@ -26,6 +26,7 @@ import java.io.LineNumberReader;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.util.Iterator;
 
 import junit.framework.Test;
 import junit.framework.TestCase;
@@ -34,6 +35,10 @@ import junit.framework.TestSuite;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.TestPDPageTree;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 
 
 /**
@@ -331,6 +336,89 @@ public class TestTextStripper extends Te
             document.close();
         }
     }
+    
+    // Returns the 0-based index of the page that the given outline item points to.
+    private int findOutlineItemDestPageNum(PDDocument doc, PDOutlineItem oi) throws IOException
+    {
+        PDPageDestination destination = (PDPageDestination) oi.getDestination();
+        int indexInTree = doc.getPages().indexOf(oi.findDestinationPage(doc));
+        int indexFromDest = destination.retrievePageNumber();
+
+        // the page tree lookup and the destination lookup must agree on the index
+        assertEquals(indexInTree, indexFromDest);
+        return indexFromDest;
+    }
+
+    /**
+     * Test whether stripping controlled by outline items works properly. The test file has 4
+     * outline items at the top level, that point to 0-based pages 0, 2, 3 and 4. We are testing
+     * text stripping by outlines pointing to 0-based pages 2 and 3, and also text stripping of the
+     * 0-based page 2. The test makes sure that the output is different to a complete strip, not
+     * empty, different to each other when different bookmark intervals are used, but identical from
+     * bookmark intervals to strips with page intervals.
+     *
+     * @throws IOException if loading the test document or stripping its text fails.
+     */
+    public void testStripByOutlineItems() throws IOException
+    {
+        PDDocument doc = PDDocument.load(TestPDPageTree.class.getResourceAsStream("with_outline.pdf"));
+        try
+        {
+            PDDocumentOutline outline = doc.getDocumentCatalog().getDocumentOutline();
+            Iterator<PDOutlineItem> it = outline.children().iterator();
+            PDOutlineItem oi0 = it.next();
+            PDOutlineItem oi2 = it.next();
+            PDOutlineItem oi3 = it.next();
+            PDOutlineItem oi4 = it.next();
+            assertEquals(0, findOutlineItemDestPageNum(doc, oi0));
+            assertEquals(2, findOutlineItemDestPageNum(doc, oi2));
+            assertEquals(3, findOutlineItemDestPageNum(doc, oi3));
+            assertEquals(4, findOutlineItemDestPageNum(doc, oi4));
+
+            String textFull = stripper.getText(doc);
+            assertFalse(textFull.isEmpty());
+
+            // 0-based pages 2 and 3 (1-based pages 3 and 4), selected by their bookmarks
+            stripper.setStartBookmark(oi2);
+            stripper.setEndBookmark(oi3);
+            String textoi23 = stripper.getText(doc);
+            assertFalse(textoi23.isEmpty());
+            assertFalse(textoi23.equals(textFull));
+
+            // 0-based pages 2 and 3 (1-based pages 3 and 4), selected by their page numbers
+            stripper.setStartBookmark(null);
+            stripper.setEndBookmark(null);
+            stripper.setStartPage(3);
+            stripper.setEndPage(4);
+            String textp34 = stripper.getText(doc);
+            assertFalse(textp34.isEmpty());
+            assertFalse(textp34.equals(textFull));
+            assertTrue(textoi23.equals(textp34));
+
+            // 0-based page 2 (1-based page 3), selected by its bookmark
+            stripper.setStartBookmark(oi2);
+            stripper.setEndBookmark(oi2);
+            String textoi2 = stripper.getText(doc);
+            assertFalse(textoi2.isEmpty());
+            assertFalse(textoi2.equals(textoi23));
+            assertFalse(textoi2.equals(textFull));
+
+            // 0-based page 2 (1-based page 3), selected by its page number
+            stripper.setStartBookmark(null);
+            stripper.setEndBookmark(null);
+            stripper.setStartPage(3);
+            stripper.setEndPage(3);
+            String textp3 = stripper.getText(doc);
+            assertFalse(textp3.isEmpty());
+            assertFalse(textp3.equals(textp34));
+            assertFalse(textp3.equals(textFull));
+            assertTrue(textoi2.equals(textp3));
+        }
+        finally
+        {
+            // release the document even when an assertion above fails
+            doc.close();
+        }
+    }
 
     /**
      * Process each file in the specified directory.
@@ -361,8 +449,7 @@ public class TestTextStripper extends Te
      *
      * @throws Exception when there is an exception
      */
-    public void testExtract()
-    throws Exception
+    public void testExtract() throws Exception
     {
         String filename = System.getProperty("org.apache.pdfbox.util.TextStripper.file");
         File inDir = new File("src/test/resources/input");