You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/11/13 02:04:31 UTC

svn commit: r594376 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Author: jukka
Date: Mon Nov 12 17:04:30 2007
New Revision: 594376

URL: http://svn.apache.org/viewvc?rev=594376&view=rev
Log:
TIKA-100 - Structured PDF parsing
    - Customized the PDFTextStripper class to produce XHTML SAX events
      (there's a somewhat similar PDFText2HTML class in PDFBox, but
      that class produces a character stream instead of SAX events)

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java   (with props)
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=594376&r1=594375&r2=594376&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Nov 12 17:04:30 2007
@@ -125,3 +125,5 @@
 56. TIKA-84 - Add MimeTypes.getMimeType(InputStream) (jukka)
 
 57. TIKA-85 - Add glob patterns from the ASF svn:eol-style documentation (jukka)
+
+58. TIKA-100 - Structured PDF parsing (jukka)

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=594376&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Nov 12 17:04:30 2007
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDPage;
+import org.pdfbox.util.PDFTextStripper;
+import org.pdfbox.util.TextPosition;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce semi-structured XHTML SAX events instead of a plain text
+ * stream.
+ * <p>
+ * The overridden callbacks may only throw {@link IOException}, so any
+ * {@link SAXException} raised by the content handler is attached as the
+ * cause of an IOException and unwrapped again in
+ * {@link #process(PDDocument, ContentHandler, Metadata)}.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, Metadata metadata)
+            throws SAXException, TikaException {
+        try {
+            new PDF2XHTML(handler, metadata).getText(document);
+        } catch (IOException e) {
+            // SAX failures are tunnelled through IOException by the
+            // callbacks below; hand them back to the caller unchanged.
+            Throwable cause = e.getCause();
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Unable to extract PDF content", e);
+        }
+    }
+
+    /**
+     * Wraps a SAX failure in an IOException that keeps the original
+     * exception as its cause. The cause is attached with
+     * {@link Throwable#initCause(Throwable)} because the
+     * IOException(String, Throwable) constructor only exists since Java 6.
+     *
+     * @param message description of the operation that failed
+     * @param cause the SAX failure reported by the content handler
+     * @return an IOException whose cause is the given SAX exception
+     */
+    private static IOException wrap(String message, SAXException cause) {
+        IOException exception = new IOException(message);
+        exception.initCause(cause);
+        return exception;
+    }
+
+    /** XHTML handler that receives all generated SAX events. */
+    private final XHTMLContentHandler handler;
+
+    /**
+     * Creates a stripper that forwards its output to the given handler,
+     * wrapped in an {@link XHTMLContentHandler}.
+     *
+     * @param handler SAX content handler
+     * @param metadata PDF metadata passed on to the XHTML handler
+     * @throws IOException declared for the superclass constructor
+     */
+    private PDF2XHTML(ContentHandler handler, Metadata metadata)
+            throws IOException {
+        this.handler = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Starts the XHTML document.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw wrap("Unable to start a document", e);
+        }
+    }
+
+    /**
+     * Ends the XHTML document.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw wrap("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Opens a <code>div</code> element for a page.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div");
+        } catch (SAXException e) {
+            throw wrap("Unable to start a page", e);
+        }
+    }
+
+    /**
+     * Closes the <code>div</code> element of a page.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw wrap("Unable to end a page", e);
+        }
+    }
+
+    /**
+     * Opens a <code>p</code> element for a paragraph.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void startParagraph() throws IOException {
+        try {
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw wrap("Unable to start a paragraph", e);
+        }
+    }
+
+    /**
+     * Closes the <code>p</code> element of a paragraph.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void endParagraph() throws IOException {
+        try {
+            handler.endElement("p");
+        } catch (SAXException e) {
+            throw wrap("Unable to end a paragraph", e);
+        }
+    }
+
+    /**
+     * Emits the character data of the given text position.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            handler.characters(text.getCharacter());
+        } catch (SAXException e) {
+            throw wrap("Unable to write characters", e);
+        }
+    }
+
+    /**
+     * Emits a newline character as the line separator.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void processLineSeparator(TextPosition p) throws IOException {
+        try {
+            handler.characters("\n");
+        } catch (SAXException e) {
+            throw wrap("Unable to write a newline", e);
+        }
+    }
+
+    /**
+     * Emits a single space as the word separator.
+     *
+     * @throws IOException if the content handler fails (cause: SAXException)
+     */
+    protected void processWordSeparator(TextPosition a, TextPosition b)
+            throws IOException {
+        try {
+            handler.characters(" ");
+        } catch (SAXException e) {
+            throw wrap("Unable to write a space", e);
+        }
+    }
+
+}

Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=594376&r1=594375&r2=594376&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Mon Nov 12 17:04:30 2007
@@ -18,17 +18,14 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.StringWriter;
 import java.util.Calendar;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
 
 import org.pdfbox.pdmodel.PDDocument;
 import org.pdfbox.pdmodel.PDDocumentInformation;
-import org.pdfbox.util.PDFTextStripper;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -40,64 +37,55 @@
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
+        PDDocument pdfDocument = PDDocument.load(stream);
         try {
-            PDDocument pdfDocument = PDDocument.load(stream);
-            try {
-                if (pdfDocument.isEncrypted()) {
+            if (pdfDocument.isEncrypted()) {
+                try {
                     pdfDocument.decrypt("");
+                } catch (Exception e) {
+                    // Ignore
                 }
-
-                PDDocumentInformation info =
-                    pdfDocument.getDocumentInformation();
-                if (info.getTitle() != null) {
-                    metadata.set(Metadata.TITLE, info.getTitle());
-                }
-                if (info.getAuthor() != null) {
-                    metadata.set(Metadata.AUTHOR, info.getAuthor());
-                }
-                if (info.getCreator() != null) {
-                    metadata.set(Metadata.CREATOR, info.getCreator());
-                }
-                if (info.getKeywords() != null) {
-                    metadata.set(Metadata.KEYWORDS, info.getKeywords());
-                }
-                if (info.getProducer() != null) {
-                    // TODO: Need a Metadata key for producer
-                    metadata.set("producer", info.getProducer());
-                }
-                if (info.getSubject() != null) {
-                    metadata.set(Metadata.SUBJECT, info.getSubject());
-                }
-                if (info.getTrapped() != null) {
-                    // TODO: Need a Metadata key for producer
-                    metadata.set("trapped", info.getTrapped());
-                }
-                Calendar created = info.getCreationDate();
-                if (created != null) {
-                    metadata.set("created", created.getTime().toString());
-                }
-                Calendar modified = info.getModificationDate();
-                if (modified != null) {
-                    metadata.set(
-                            Metadata.LAST_MODIFIED,
-                            modified.getTime().toString());
-                }
-
-                StringWriter writer = new StringWriter();
-                new PDFTextStripper().writeText(pdfDocument, writer);
-
-                XHTMLContentHandler xhtml =
-                    new XHTMLContentHandler(handler, metadata);
-                xhtml.startDocument();
-                xhtml.element("p", writer.getBuffer().toString());
-                xhtml.endDocument();
-            } finally {
-                pdfDocument.close();
             }
+            metadata.add(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata);
+            PDF2XHTML.process(pdfDocument, handler, metadata);
+        } finally {
+            pdfDocument.close();
+        }
+    }
+
+    private void extractMetadata(PDDocument document, Metadata metadata)
+            throws TikaException {
+        PDDocumentInformation info = document.getDocumentInformation();
+        addMetadata(metadata, Metadata.TITLE, info.getTitle());
+        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
+        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
+        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
+        addMetadata(metadata, "producer", info.getProducer());
+        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
+        addMetadata(metadata, "trapped", info.getTrapped());
+        try {
+            addMetadata(metadata, "created", info.getCreationDate());
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+        try {
+            Calendar modified = info.getModificationDate(); 
+            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
         } catch (IOException e) {
-            throw e;
-        } catch (Exception e) {
-            throw new TikaException("Error parsing a PDF document", e);
+            // Invalid date format, just ignore
+        }
+    }
+
+    private void addMetadata(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    private void addMetadata(Metadata metadata, String name, Calendar value) {
+        if (value != null) {
+            metadata.set(name, value.getTime().toString());
         }
     }
 



Re: svn commit: r594376 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Posted by Jukka Zitting <ju...@gmail.com>.
Hi,

On Nov 18, 2007 7:54 PM, Jeremias Maerki <de...@jeremias-maerki.ch> wrote:
> The constructor IOException(String, Exception) only exists since Java 6.
> I don't think that was intended, was it?

Oh, bugger. Certainly not intended (see [1]), I just wrongly recalled
that the constructor would have been available already in Java 5.

Thanks, Chris, for fixing the problem.

PS. Does anyone have an idea on how Maven could be made to select
which JDK to use based on project metadata?

[1] http://jukkaz.wordpress.com/2007/05/17/the-cause-of-an-ioexception/

BR,

Jukka Zitting

Re: svn commit: r594376 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Posted by Chris Mattmann <ch...@jpl.nasa.gov>.
Hi Guys,

 This has been fixed in r596143:

 http://svn.apache.org/viewvc?rev=596143&view=rev

Cheers,
  Chris
 

On 11/18/07 9:54 AM, "Jeremias Maerki" <de...@jeremias-maerki.ch> wrote:

> The constructor IOException(String, Exception) only exists since Java 6.
> I don't think that was intended, was it?
> 
> Jeremias Maerki
> 
> 
> 
> On 13.11.2007 02:04:31 jukka wrote:
>> Author: jukka
>> Date: Mon Nov 12 17:04:30 2007
>> New Revision: 594376
>> 
>> URL: http://svn.apache.org/viewvc?rev=594376&view=rev
>> Log:
>> TIKA-100 - Structured PDF parsing
>>     - Customized the PdfTextStripper class to produce XHTML SAX events
>>       (there's a somewhat similar PdfText2HTML class in PDFBox, but
>>       that class produces a character stream instead of SAX events)
>> 
>> Added:
>>     
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> (with props)
>> Modified:
>>     incubator/tika/trunk/CHANGES.txt
>>     
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
>> 
> <snip/>
>> Added: 
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> URL: 
>> http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/ti
>> ka/parser/pdf/PDF2XHTML.java?rev=594376&view=auto
>> 
=============================================================================>>
=
>> --- 
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> (added)
>> +++ 
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> Mon Nov 12 17:04:30 2007
>> +    protected void endDocument(PDDocument pdf) throws IOException {
>> +        try {
>> +            handler.endDocument();
>> +        } catch (SAXException e) {
>> +            throw new IOException("Unable to end a document", e);
>> +        }
>> +    }
> 

______________________________________________
Chris Mattmann, Ph.D.
Chris.Mattmann@jpl.nasa.gov
Cognizant Development Engineer
Early Detection Research Network Project
_________________________________________________
Jet Propulsion Laboratory            Pasadena, CA
Office: 171-266B                     Mailstop:  171-246
_______________________________________________________

Disclaimer:  The opinions presented within are my own and do not reflect
those of either NASA, JPL, or the California Institute of Technology.



Re: svn commit: r594376 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Posted by Chris Mattmann <ch...@jpl.nasa.gov>.
I've verified this behavior as well while trying to apply and commit the
patch for TIKA-101. I think that the trunk is broken. I'll go ahead and fix
it. 

In the future, we should probably have nightly builds to catch stuff like
this. Also, please try to be more vigilant about making sure that your
environment is set to JDK 5 before committing an update.

Thanks!

Cheers,
  Chris



On 11/18/07 9:54 AM, "Jeremias Maerki" <de...@jeremias-maerki.ch> wrote:

> The constructor IOException(String, Exception) only exists since Java 6.
> I don't think that was intended, was it?
> 
> Jeremias Maerki
> 
> 
> 
> On 13.11.2007 02:04:31 jukka wrote:
>> Author: jukka
>> Date: Mon Nov 12 17:04:30 2007
>> New Revision: 594376
>> 
>> URL: http://svn.apache.org/viewvc?rev=594376&view=rev
>> Log:
>> TIKA-100 - Structured PDF parsing
>>     - Customized the PdfTextStripper class to produce XHTML SAX events
>>       (there's a somewhat similar PdfText2HTML class in PDFBox, but
>>       that class produces a character stream instead of SAX events)
>> 
>> Added:
>>     
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> (with props)
>> Modified:
>>     incubator/tika/trunk/CHANGES.txt
>>     
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
>> 
> <snip/>
>> Added: 
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> URL: 
>> http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/ti
>> ka/parser/pdf/PDF2XHTML.java?rev=594376&view=auto
>> 
=============================================================================>>
=
>> --- 
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> (added)
>> +++ 
>> incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
>> Mon Nov 12 17:04:30 2007
>> +    protected void endDocument(PDDocument pdf) throws IOException {
>> +        try {
>> +            handler.endDocument();
>> +        } catch (SAXException e) {
>> +            throw new IOException("Unable to end a document", e);
>> +        }
>> +    }
> 

______________________________________________
Chris Mattmann, Ph.D.
Chris.Mattmann@jpl.nasa.gov
Cognizant Development Engineer
Early Detection Research Network Project
_________________________________________________
Jet Propulsion Laboratory            Pasadena, CA
Office: 171-266B                     Mailstop:  171-246
_______________________________________________________

Disclaimer:  The opinions presented within are my own and do not reflect
those of either NASA, JPL, or the California Institute of Technology.



Re: svn commit: r594376 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Posted by Jeremias Maerki <de...@jeremias-maerki.ch>.
The constructor IOException(String, Throwable) only exists since Java 6.
I don't think that was intended, was it?

Jeremias Maerki



On 13.11.2007 02:04:31 jukka wrote:
> Author: jukka
> Date: Mon Nov 12 17:04:30 2007
> New Revision: 594376
> 
> URL: http://svn.apache.org/viewvc?rev=594376&view=rev
> Log:
> TIKA-100 - Structured PDF parsing
>     - Customized the PdfTextStripper class to produce XHTML SAX events
>       (there's a somewhat similar PdfText2HTML class in PDFBox, but
>       that class produces a character stream instead of SAX events)
> 
> Added:
>     incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java   (with props)
> Modified:
>     incubator/tika/trunk/CHANGES.txt
>     incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
> 
<snip/>
> Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
> URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=594376&view=auto
> ==============================================================================
> --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (added)
> +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Nov 12 17:04:30 2007
> +    protected void endDocument(PDDocument pdf) throws IOException {
> +        try {
> +            handler.endDocument();
> +        } catch (SAXException e) {
> +            throw new IOException("Unable to end a document", e);
> +        }
> +    }