You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/12/01 19:03:51 UTC
svn commit: r1416032 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Sat Dec 1 18:03:50 2012
New Revision: 1416032
URL: http://svn.apache.org/viewvc?rev=1416032&view=rev
Log:
TIKA-1035: extract text from PDF bookmarks
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416032&r1=1416031&r2=1416032&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec 1 18:03:50 2012
@@ -7,7 +7,8 @@ Release 1.3 - Current Development
(TIKA-956, TIKA-1019). Embedded Wordpad/RTF documents are now
recognized (TIKA-982).
- * PDF: Text from pop-up annotations is now extracted (TIKA-981)
+ * PDF: Text from pop-up annotations is now extracted (TIKA-981).
+ Text from bookmarks is now extracted (TIKA-1035).
* PKCS7: Detached signatures no longer through NullPointerException
(TIKA-986).
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1416032&r1=1416031&r2=1416032&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sat Dec 1 18:03:50 2012
@@ -21,11 +21,14 @@ import java.io.Writer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOExceptionWithCause;
@@ -61,10 +64,12 @@ class PDF2XHTML extends PDFTextStripper
throws SAXException, TikaException {
try {
// Extract text using a dummy Writer as we override the
- // key methods to output to the given content handler.
- new PDF2XHTML(handler, metadata,
- extractAnnotationText, enableAutoSpace,
- suppressDuplicateOverlappingText, sortByPosition).writeText(document, new Writer() {
+ // key methods to output to the given content
+ // handler.
+ PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, metadata,
+ extractAnnotationText, enableAutoSpace,
+ suppressDuplicateOverlappingText, sortByPosition);
+ pdf2XHTML.writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
}
@@ -75,6 +80,10 @@ class PDF2XHTML extends PDFTextStripper
public void close() {
}
});
+
+ // Also extract text for any bookmarks:
+ pdf2XHTML.extractBookmarkText();
+
} catch (IOException e) {
if (e.getCause() instanceof SAXException) {
throw (SAXException) e.getCause();
@@ -106,6 +115,26 @@ class PDF2XHTML extends PDFTextStripper
setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
}
+ void extractBookmarkText() throws SAXException { // Writes the titles of the document's bookmarks (its outline), if present, to the content handler.
+ PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
+ if (outline != null) { // skip entirely when the catalog has no outline
+ handler.newline(); // emit a newline before the first bookmark title
+ extractBookmarkText(outline, ""); // walk from the outline root; top-level titles get an empty indent
+ }
+ }
+
+ void extractBookmarkText(PDOutlineNode bookmark, String indent) throws SAXException { // Depth-first walk: writes each child's title on its own line, prefixed by indent.
+ PDOutlineItem current = bookmark.getFirstChild();
+ while (current != null) { // follow the sibling chain until getNextSibling() returns null
+ handler.characters(indent);
+ handler.characters(current.getTitle()); // NOTE(review): getTitle() may be null for an untitled item; confirm characters() tolerates null
+ handler.newline();
+ // Recurse into this item's children, with a longer indent, before moving to its next sibling:
+ extractBookmarkText(current, indent + " "); // NOTE(review): no guard against cyclic outline links; presumably a malformed PDF could recurse without end, confirm
+ current = current.getNextSibling();
+ }
+ }
+
@Override
protected void startDocument(PDDocument pdf) throws IOException {
try {
@@ -261,7 +290,7 @@ class PDF2XHTML extends PDFTextStripper
@Override
protected void writeLineSeparator() throws IOException {
try {
- handler.characters("\n");
+ handler.newline();
} catch (SAXException e) {
throw new IOExceptionWithCause(
"Unable to write a newline character", e);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1416032&r1=1416031&r2=1416032&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sat Dec 1 18:03:50 2012
@@ -17,12 +17,6 @@
package org.apache.tika.parser.pdf;
import java.io.InputStream;
-import java.io.StringWriter;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
@@ -451,4 +445,10 @@ public class PDFParserTest extends TikaT
// Column text is now interleaved:
assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
}
+
+ // TIKA-1035: text from PDF bookmarks must appear in the parser's XML output
+ public void testBookmarks() throws Exception {
+ String xml = getXML("testPDF_bookmarks.pdf").xml; // parse the test document and keep the resulting XML string
+ assertContains("Denmark bookmark is here", xml); // presumably the title of a bookmark in the test PDF
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf?rev=1416032&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_bookmarks.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf