You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ju...@apache.org on 2011/12/19 15:52:56 UTC

svn commit: r1220781 - in /pdfbox/trunk/pdfbox: download/ src/main/java/org/apache/pdfbox/tika/ src/test/java/org/apache/pdfbox/tika/ src/test/resources/org/apache/pdfbox/tika/

Author: jukka
Date: Mon Dec 19 14:52:55 2011
New Revision: 1220781

URL: http://svn.apache.org/viewvc?rev=1220781&view=rev
Log:
PDFBOX-1132: Add Tika parser classes

Merge recent changes from Tika trunk. Full details below.


Revision: 1206568
Author:   Michael McCandless <mi...@apache.org>
Date:     Sat Nov 26 19:57:15 2011 +0000

    TIKA-778: fix cases where PDFParser produced too many </p> tags

Revision: 1203287
Author:   Michael McCandless <mi...@apache.org>
Date:     Thu Nov 17 17:24:26 2011 +0000

    TIKA-612: enable controlling PDFBox's setSortByPosition from PDFParser

Revision: 1197630
Author:   Michael McCandless <mi...@apache.org>
Date:     Fri Nov 4 16:28:38 2011 +0000

    TIKA-767: allow controlling whether PDFBox should try to remove overlapped duplicated text; default to disabled

Revision: 1195596
Author:   Jukka Zitting <ju...@apache.org>
Date:     Mon Oct 31 18:22:06 2011 +0000

    TIKA-565: Improved OSGi bundling

    Use central OSGiParser and OSGiDetector classes in tika-parsers to better handle issues with missing dependencies and t

Revision: 1195500
Author:   Jukka Zitting <ju...@apache.org>
Date:     Mon Oct 31 15:01:33 2011 +0000

    TIKA-565: Improved OSGi bundling

    Mark all Parser services with SCR annotations

Revision: 1186775
Author:   Michael McCandless <mi...@apache.org>
Date:     Thu Oct 20 12:55:46 2011 +0000

    TIKA-724: add option to PDFParser to control auto-space behavior

Revision: 1186771
Author:   Michael McCandless <mi...@apache.org>
Date:     Thu Oct 20 12:49:50 2011 +0000

    TIKA-738: optionally extract PDF annotations

Added:
    pdfbox/trunk/pdfbox/download/
    pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar
    pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf   (with props)
    pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf   (with props)
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java

Added: pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar?rev=1220781&view=auto
==============================================================================
Files pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar (added) and pdfbox/trunk/pdfbox/download/pcfi-2010.08.09.jar Mon Dec 19 14:52:55 2011 differ

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java?rev=1220781&r1=1220780&r2=1220781&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java Mon Dec 19 14:52:55 2011
@@ -21,6 +21,8 @@ import java.io.Writer;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
 import org.apache.pdfbox.util.PDFTextStripper;
 import org.apache.pdfbox.util.TextPosition;
 import org.apache.tika.exception.TikaException;
@@ -51,12 +53,16 @@ class PDF2XHTML extends PDFTextStripper 
      * @throws TikaException if the PDF document can not be processed
      */
     public static void process(
-            PDDocument document, ContentHandler handler, Metadata metadata)
+            PDDocument document, ContentHandler handler, Metadata metadata,
+            boolean extractAnnotationText, boolean enableAutoSpace,
+            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
             throws SAXException, TikaException {
         try {
             // Extract text using a dummy Writer as we override the
             // key methods to output to the given content handler.
-            new PDF2XHTML(handler, metadata).writeText(document, new Writer() {
+            new PDF2XHTML(handler, metadata,
+                    extractAnnotationText, enableAutoSpace,
+                    suppressDuplicateOverlappingText, sortByPosition).writeText(document, new Writer() {
                 @Override
                 public void write(char[] cbuf, int off, int len) {
                 }
@@ -77,12 +83,25 @@ class PDF2XHTML extends PDFTextStripper 
     }
 
     private final XHTMLContentHandler handler;
+    private final boolean extractAnnotationText;
 
-    private PDF2XHTML(ContentHandler handler, Metadata metadata)
+    private PDF2XHTML(ContentHandler handler, Metadata metadata,
+            boolean extractAnnotationText, boolean enableAutoSpace,
+            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
             throws IOException {
         this.handler = new XHTMLContentHandler(handler, metadata);
+        this.extractAnnotationText = extractAnnotationText;
         setForceParsing(true);
-        setSortByPosition(false);
+        setSortByPosition(sortByPosition);
+        if (enableAutoSpace) {
+            setWordSeparator(" ");
+        } else {
+            setWordSeparator("");
+        }
+        // TODO: maybe expose setting these too:
+        //setAverageCharTolerance(1.0f);
+        //setSpacingTolerance(1.0f);
+        setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
     }
 
     @Override
@@ -107,16 +126,53 @@ class PDF2XHTML extends PDFTextStripper 
     protected void startPage(PDPage page) throws IOException {
         try {
             handler.startElement("div", "class", "page");
-            handler.startElement("p");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to start a page", e);
         }
+        writeParagraphStart();
     }
 
     @Override
     protected void endPage(PDPage page) throws IOException {
+
         try {
-            handler.endElement("p");
+            writeParagraphEnd();
+            // TODO: remove once PDFBOX-1143 is fixed:
+            if (extractAnnotationText) {
+                for(Object o : page.getAnnotations()) {
+                    if ((o instanceof PDAnnotation) && PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(((PDAnnotation) o).getSubtype())) {
+                        // It's a text annotation:
+                        PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
+                        String title = annot.getTitlePopup();
+                        String subject = annot.getTitlePopup();
+                        String contents = annot.getContents();
+                        // TODO: maybe also annot.getRichContents()?
+                        if (title != null || subject != null || contents != null) {
+                            handler.startElement("div", "class", "annotation");
+
+                            if (title != null) {
+                                handler.startElement("div", "class", "annotationTitle");
+                                handler.characters(title);
+                                handler.endElement("div");
+                            }
+
+                            if (subject != null) {
+                                handler.startElement("div", "class", "annotationSubject");
+                                handler.characters(subject);
+                                handler.endElement("div");
+                            }
+
+                            if (contents != null) {
+                                handler.startElement("div", "class", "annotationContents");
+                                handler.characters(contents);
+                                handler.endElement("div");
+                            }
+
+                            handler.endElement("div");
+                        }
+                    }
+                }
+            }
             handler.endElement("div");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a page", e);
@@ -177,7 +233,7 @@ class PDF2XHTML extends PDFTextStripper 
     @Override
     protected void writeWordSeparator() throws IOException {
         try {
-            handler.characters(" ");
+            handler.characters(getWordSeparator());
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a space character", e);

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java?rev=1220781&r1=1220780&r2=1220781&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java Mon Dec 19 14:52:55 2011
@@ -56,6 +56,20 @@ public class PDFParser extends AbstractP
     /** Serial version UID */
     private static final long serialVersionUID = -752276948656079347L;
 
+    // True if we let PDFBox "guess" where spaces should go:
+    private boolean enableAutoSpace = true;
+
+    // True if we let PDFBox remove duplicate overlapping text:
+    private boolean suppressDuplicateOverlappingText;
+
+    // True if we extract annotation text ourselves
+    // (workaround for PDFBOX-1143):
+    private boolean extractAnnotationText = true;
+
+    // True if we should sort text tokens by position
+    // (necessary for some PDFs, but messes up other PDFs):
+    private boolean sortByPosition = false;
+
     /**
      * Metadata key for giving the document password to the parser.
      */
@@ -99,7 +113,9 @@ public class PDFParser extends AbstractP
             }
             metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
             extractMetadata(pdfDocument, metadata);
-            PDF2XHTML.process(pdfDocument, handler, metadata);
+            PDF2XHTML.process(pdfDocument, handler, metadata,
+                    extractAnnotationText, enableAutoSpace,
+                    suppressDuplicateOverlappingText, sortByPosition);
         } finally {
             pdfDocument.close();
         }
@@ -178,4 +194,70 @@ public class PDFParser extends AbstractP
             addMetadata(metadata, name, value.toString());
         }
     }
+
+    /**
+     *  If true (the default), the parser should estimate
+     *  where spaces should be inserted between words.  For
+     *  many PDFs this is necessary as they do not include
+     *  explicit whitespace characters.
+     */
+    public void setEnableAutoSpace(boolean v) {
+        enableAutoSpace = v;
+    }
+
+    /** @see #setEnableAutoSpace. */
+    public boolean getEnableAutoSpace() {
+        return enableAutoSpace;
+    }
+
+    /**
+     * If true (the default), text in annotations will be
+     * extracted.
+     */
+    public void setExtractAnnotationText(boolean v) {
+        extractAnnotationText = v;
+    }
+
+    /**
+     * If true, text in annotations will be extracted.
+     */
+    public boolean getExtractAnnotationText() {
+        return extractAnnotationText;
+    }
+
+    /**
+     *  If true, the parser should try to remove duplicated
+     *  text over the same region.  This is needed for some
+     *  PDFs that achieve bolding by re-writing the same
+     *  text in the same area.  Note that this can
+     *  slow down extraction substantially (PDFBOX-956) and
+     *  sometimes remove characters that were not in fact
+     *  duplicated (PDFBOX-1155).  By default this is disabled.
+     */
+    public void setSuppressDuplicateOverlappingText(boolean v) {
+        suppressDuplicateOverlappingText = v;
+    }
+
+    /** @see #setSuppressDuplicateOverlappingText. */
+    public boolean getSuppressDuplicateOverlappingText() {
+        return suppressDuplicateOverlappingText;
+    }
+
+    /**
+     *  If true, sort text tokens by their x/y position
+     *  before extracting text.  This may be necessary for
+     *  some PDFs (if the text tokens are not rendered "in
+     *  order"), while for other PDFs it can produce the
+     *  wrong result (for example if there are 2 columns,
+     *  the text will be interleaved).  Default is false.
+     */
+    public void setSortByPosition(boolean v) {
+        sortByPosition = v;
+    }
+
+    /** @see #setSortByPosition. */
+    public boolean getSortByPosition() {
+        return sortByPosition;
+    }
+
 }

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java?rev=1220781&r1=1220780&r2=1220781&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java Mon Dec 19 14:52:55 2011
@@ -219,8 +219,7 @@ public class PDFParserTest extends TestC
         //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
     }
 
-    // TIKA-738: re-enable this
-    public void IGNOREtestAnnotations() throws Exception {
+    public void testAnnotations() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
@@ -236,15 +235,160 @@ public class PDFParserTest extends TestC
         content = content.replaceAll("[\\s\u00a0]+"," ");
         assertContains("Here is some text", content);
         assertContains("Here is a comment", content);
+
+        // Test w/ annotation text disabled:
+        PDFParser pdfParser = new PDFParser();
+        pdfParser.setExtractAnnotationText(false);
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+        context = new ParseContext();
+        stream = PDFParserTest.class.getResourceAsStream("testAnnotations.pdf");
+        try {
+            pdfParser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        content = handler.toString();
+        content = content.replaceAll("[\\s\u00a0]+"," ");
+        assertContains("Here is some text", content);
+        assertEquals(-1, content.indexOf("Here is a comment"));
+
+        // TIKA-738: make sure no extra </p> tags
+        String xml = getXML("testAnnotations.pdf").xml;
+        assertEquals(substringCount("<p>", xml),
+                substringCount("</p>", xml));
+    }
+
+    private static int substringCount(String needle, String haystack) {
+        int upto = -1;
+        int count = 0;
+        while(true) {
+            final int next = haystack.indexOf(needle, upto);
+            if (next == -1) {
+                break;
+            }
+            count++;
+            upto = next+1;
+        }
+
+        return count;
     }
 
     public void testPageNumber() throws Exception {
-        String result = getXML("testPageNumber.pdf");
+        String result = getXML("testPageNumber.pdf").xml;
         String content = result.replaceAll("\\s+","");
         assertContains("<p>1</p>", content);
     }
 
-    private String getXML(String filename) throws Exception {
+    public void testDisableAutoSpace() throws Exception {
+        PDFParser parser = new PDFParser();
+        parser.setEnableAutoSpace(false);
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        InputStream stream = PDFParserTest.class.getResourceAsStream("testExtraSpaces.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        String content = handler.toString();
+        content = content.replaceAll("[\\s\u00a0]+"," ");
+        // Text is correct when autoSpace is off:
+        assertContains("Here is some formatted text", content);
+
+        parser.setEnableAutoSpace(true);
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+        context = new ParseContext();
+        stream = PDFParserTest.class.getResourceAsStream("testExtraSpaces.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        content = handler.toString();
+        content = content.replaceAll("[\\s\u00a0]+"," ");
+        // Text is correct when autoSpace is off:
+
+        // Text has extra spaces when autoSpace is on
+        assertEquals(-1, content.indexOf("Here is some formatted text"));
+    }
+
+    public void testDuplicateOverlappingText() throws Exception {
+        PDFParser parser = new PDFParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        InputStream stream =PDFParserTest.class.getResourceAsStream("testOverlappingText.pdf");
+        // Default is false (keep overlapping text):
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        String content = handler.toString();
+        assertContains("Text the first timeText the second time", content);
+
+        parser.setSuppressDuplicateOverlappingText(true);
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+        context = new ParseContext();
+        stream = PDFParserTest.class.getResourceAsStream("testOverlappingText.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        content = handler.toString();
+        // "Text the first" was dedup'd:
+        assertContains("Text the first timesecond time", content);
+    }
+
+    public void testSortByPosition() throws Exception {
+        PDFParser parser = new PDFParser();
+        parser.setEnableAutoSpace(false);
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        InputStream stream = PDFParserTest.class.getResourceAsStream("testPDFTwoTextBoxes.pdf");
+        // Default is false (do not sort):
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        String content = handler.toString();
+        content = content.replaceAll("\\s+", " ");
+        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
+
+        parser.setSortByPosition(true);
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+        context = new ParseContext();
+        stream = PDFParserTest.class.getResourceAsStream("testPDFTwoTextBoxes.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        content = handler.toString();
+        content = content.replaceAll("\\s+", " ");
+        // Column text is now interleaved:
+        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
+    }
+
+    private static class XMLResult {
+        public final String xml;
+        public final Metadata metadata;
+
+        public XMLResult(String xml, Metadata metadata) {
+            this.xml = xml;
+            this.metadata = metadata;
+      }
+    }
+
+    private XMLResult getXML(String filename) throws Exception {
         Metadata metadata = new Metadata();
         Parser parser = new AutoDetectParser(); // Should auto-detect!
         StringWriter sw = new StringWriter();
@@ -255,12 +399,11 @@ public class PDFParserTest extends TestC
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
         handler.setResult(new StreamResult(sw));
 
-        // Try with a document containing various tables and formatting
-        InputStream input = PDFParserTest.class.getResourceAsStream(
-                filename);
+        // Try with a document containing various tables and formattings
+        InputStream input = PDFParserTest.class.getResourceAsStream(filename);
         try {
             parser.parse(input, handler, metadata, new ParseContext());
-            return sw.toString();
+            return new XMLResult(sw.toString(), metadata);
         } finally {
             input.close();
         }

Added: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf?rev=1220781&view=auto
==============================================================================
Binary file - no diff available.

Propchange: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf
------------------------------------------------------------------------------
    svn:executable = *

Propchange: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testExtraSpaces.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf

Added: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf?rev=1220781&view=auto
==============================================================================
Binary file - no diff available.

Propchange: pdfbox/trunk/pdfbox/src/test/resources/org/apache/pdfbox/tika/testOverlappingText.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf