You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/12/01 19:03:51 UTC
svn commit: r1416032 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Sat Dec  1 18:03:50 2012
New Revision: 1416032

URL: http://svn.apache.org/viewvc?rev=1416032&view=rev
Log:
TIKA-1035: extract text from PDF bookmarks

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416032&r1=1416031&r2=1416032&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec  1 18:03:50 2012
@@ -7,7 +7,8 @@ Release 1.3 - Current Development
     (TIKA-956, TIKA-1019).  Embedded Wordpad/RTF documents are now
     recognized (TIKA-982).
 
-  * PDF: Text from pop-up annotations is now extracted (TIKA-981)
+  * PDF: Text from pop-up annotations is now extracted (TIKA-981).
+    Text from bookmarks is now extracted (TIKA-1035).
 
   * PKCS7: Detached signatures no longer through NullPointerException
     (TIKA-986).

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1416032&r1=1416031&r2=1416032&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sat Dec  1 18:03:50 2012
@@ -21,11 +21,14 @@ import java.io.Writer;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.util.PDFTextStripper;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.util.PDFTextStripper;
 import org.apache.pdfbox.util.TextPosition;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOExceptionWithCause;
@@ -61,10 +64,12 @@ class PDF2XHTML extends PDFTextStripper 
             throws SAXException, TikaException {
         try {
             // Extract text using a dummy Writer as we override the
-            // key methods to output to the given content handler.
-            new PDF2XHTML(handler, metadata,
-                          extractAnnotationText, enableAutoSpace,
-                          suppressDuplicateOverlappingText, sortByPosition).writeText(document, new Writer() {
+            // key methods to output to the given content
+            // handler.
+            PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, metadata,
+                                                extractAnnotationText, enableAutoSpace,
+                                                suppressDuplicateOverlappingText, sortByPosition);
+            pdf2XHTML.writeText(document, new Writer() {
                 @Override
                 public void write(char[] cbuf, int off, int len) {
                 }
@@ -75,6 +80,10 @@ class PDF2XHTML extends PDFTextStripper 
                 public void close() {
                 }
             });
+
+            // Also extract text for any bookmarks:
+            pdf2XHTML.extractBookmarkText();
+
         } catch (IOException e) {
             if (e.getCause() instanceof SAXException) {
                 throw (SAXException) e.getCause();
@@ -106,6 +115,26 @@ class PDF2XHTML extends PDFTextStripper 
         setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
     }
 
+    // Writes the titles of the document's outline (bookmark) entries, if it has an outline.
+    void extractBookmarkText() throws SAXException {
+        PDDocumentOutline root = document.getDocumentCatalog().getDocumentOutline();
+        if (root == null) { return; }
+        handler.newline();
+        extractBookmarkText(root, "");
+    }
+
+    // Writes the title of each child of the given outline node on its own line, prefixed
+    // by indent; each child's own children follow it, indented four more spaces.
+    void extractBookmarkText(PDOutlineNode bookmark, String indent) throws SAXException {
+        String childIndent = indent + "    ";
+        for (PDOutlineItem item = bookmark.getFirstChild(); item != null; item = item.getNextSibling()) {
+            handler.characters(indent);
+            handler.characters(item.getTitle());
+            handler.newline();
+            extractBookmarkText(item, childIndent);
+        }
+    }
+
     @Override
     protected void startDocument(PDDocument pdf) throws IOException {
         try {
@@ -261,7 +290,7 @@ class PDF2XHTML extends PDFTextStripper 
     @Override
     protected void writeLineSeparator() throws IOException {
         try {
-            handler.characters("\n");
+            handler.newline();
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a newline character", e);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1416032&r1=1416031&r2=1416032&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sat Dec  1 18:03:50 2012
@@ -17,12 +17,6 @@
 package org.apache.tika.parser.pdf;
 
 import java.io.InputStream;
-import java.io.StringWriter;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
@@ -451,4 +445,10 @@ public class PDFParserTest extends TikaT
         // Column text is now interleaved:
         assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
     }
+
+    // TIKA-1035
+    public void testBookmarks() throws Exception {
+        // The bookmark title must appear in the XHTML produced for the test PDF.
+        assertContains("Denmark bookmark is here", getXML("testPDF_bookmarks.pdf").xml);
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf?rev=1416032&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf