You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ju...@apache.org on 2011/10/13 16:39:29 UTC

svn commit: r1182880 - in /pdfbox/trunk: fontbox/ fontbox/src/main/java/org/apache/fontbox/tika/ fontbox/src/main/resources/ fontbox/src/main/resources/META-INF/ fontbox/src/main/resources/META-INF/services/ fontbox/src/test/java/org/apache/fontbox/tik...

Author: jukka
Date: Thu Oct 13 14:39:28 2011
New Revision: 1182880

URL: http://svn.apache.org/viewvc?rev=1182880&view=rev
Log:
PDFBOX-1132: Add Tika parser classes

Copy PDF and TTF parser classes and related test cases from Tika.

Added:
    pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/
    pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java
    pdfbox/trunk/fontbox/src/main/resources/
    pdfbox/trunk/fontbox/src/main/resources/META-INF/
    pdfbox/trunk/fontbox/src/main/resources/META-INF/services/
    pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
    pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/
    pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
    pdfbox/trunk/pdfbox/src/main/resources/META-INF/
    pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/
    pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java
Modified:
    pdfbox/trunk/fontbox/pom.xml
    pdfbox/trunk/parent/pom.xml
    pdfbox/trunk/pdfbox/pom.xml
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java

Modified: pdfbox/trunk/fontbox/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/pom.xml?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/pom.xml (original)
+++ pdfbox/trunk/fontbox/pom.xml Thu Oct 13 14:39:28 2011
@@ -39,6 +39,11 @@
 
   <dependencies>
     <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-core</artifactId>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <version>4.8.1</version>
@@ -51,7 +56,6 @@
       <plugin>
         <groupId>org.apache.felix</groupId>
         <artifactId>maven-bundle-plugin</artifactId>
-        <version>2.0.0</version>
         <extensions>true</extensions>
       </plugin>
     </plugins>

Added: pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java (added)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.fontbox.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Tika parser for TrueType font files (TTF).
+ *
+ * @since Apache Fontbox 1.7.0
+ */
+public class TrueTypeParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 7276565828404664974L;
+
+    private static final MediaType TYPE =
+        MediaType.application("x-font-ttf");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(TYPE);
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TrueTypeFont font;
+        TTFParser parser = new TTFParser();
+        TikaInputStream tis = TikaInputStream.cast(stream);
+        if (tis != null && tis.hasFile()) {
+            font = parser.parseTTF(tis.getFile());
+        } else {
+            font = parser.parseTTF(stream);
+        }
+
+        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+        metadata.set(DublinCore.DATE, font.getHeader().getCreated().getTime());
+        metadata.set(
+                Property.internalDate(DublinCore.MODIFIED),
+                font.getHeader().getModified().getTime());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}

Added: pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Thu Oct 13 14:39:28 2011
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.fontbox.tika.TrueTypeParser

Added: pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java (added)
+++ pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.fontbox.tika;
+
+import java.io.BufferedInputStream;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+
+public class TrueTypeParserTest extends TestCase {
+
+    public void testTrueTypeParsing() throws Exception {
+        Tika tika = new Tika();
+        String type = "application/x-font-ttf";
+
+        Metadata metadata = new Metadata();
+        InputStream stream = new BufferedInputStream(
+                TrueTypeParserTest.class.getResourceAsStream(
+                        "testTrueType.ttf"));
+        assertEquals(type, tika.detect(stream));
+        assertEquals("", tika.parseToString(stream, metadata));
+        assertEquals("1903-12-31T23:00:00Z", metadata.get(Metadata.DATE));
+        assertEquals("1903-12-31T23:00:00Z", metadata.get(Metadata.MODIFIED));
+        assertEquals(type, metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+}

Modified: pdfbox/trunk/parent/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/parent/pom.xml?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/parent/pom.xml (original)
+++ pdfbox/trunk/parent/pom.xml Thu Oct 13 14:39:28 2011
@@ -108,11 +108,26 @@
               </excludes>
             </configuration>
           </plugin>
+          <plugin>
+            <groupId>org.apache.felix</groupId>
+            <artifactId>maven-bundle-plugin</artifactId>
+            <version>2.3.4</version>
+          </plugin>
       </plugins>
     </pluginManagement>
   </build>
 
- <!-- Developers listed by PMC Chair, PMC all alphabetical-->
+  <dependencyManagement>
+    <dependencies>
+      <dependency>
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-core</artifactId>
+        <version>0.10</version>
+      </dependency>
+    </dependencies>
+  </dependencyManagement>
+
+  <!-- Developers listed by PMC Chair, PMC all alphabetical-->
   <developers>
     <developer>
         <id>lehmi</id>

Modified: pdfbox/trunk/pdfbox/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/pom.xml?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/pom.xml (original)
+++ pdfbox/trunk/pdfbox/pom.xml Thu Oct 13 14:39:28 2011
@@ -71,6 +71,11 @@
       <optional>true</optional>
     </dependency>
     <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-core</artifactId>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <version>4.8.1</version>
@@ -129,7 +134,6 @@
       <plugin>
         <groupId>org.apache.felix</groupId>
         <artifactId>maven-bundle-plugin</artifactId>
-        <version>2.0.1</version>
         <extensions>true</extensions>
         <configuration>
           <instructions>

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.tika;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFTextStripper;
+import org.apache.pdfbox.util.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce a stream of semi-structured XHTML SAX events instead of a
+ * plain text stream.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    // TODO: remove once PDFBOX-1130 is fixed:
+    private boolean inParagraph = false;
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document cannot be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, Metadata metadata)
+            throws SAXException, TikaException {
+        try {
+            // Extract text using a dummy Writer as we override the
+            // key methods to output to the given content handler.
+            new PDF2XHTML(handler, metadata).writeText(document, new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+                @Override
+                public void flush() {
+                }
+                @Override
+                public void close() {
+                }
+            });
+        } catch (IOException e) {
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+    }
+
+    private final XHTMLContentHandler handler;
+
+    private PDF2XHTML(ContentHandler handler, Metadata metadata)
+            throws IOException {
+        this.handler = new XHTMLContentHandler(handler, metadata);
+        setForceParsing(true);
+        setSortByPosition(false);
+    }
+
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a document", e);
+        }
+    }
+
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div", "class", "page");
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a page", e);
+        }
+    }
+
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            handler.endElement("p");
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        }
+    }
+
+    @Override
+    protected void writeParagraphStart() throws IOException {
+        // TODO: remove once PDFBOX-1130 is fixed
+        if (inParagraph) {
+            // Close last paragraph
+            writeParagraphEnd();
+        }
+        assert !inParagraph;
+        inParagraph = true;
+        try {
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a paragraph", e);
+        }
+    }
+
+    @Override
+    protected void writeParagraphEnd() throws IOException {
+        // TODO: remove once PDFBOX-1130 is fixed
+        if (!inParagraph) {
+            writeParagraphStart();
+        }
+        assert inParagraph;
+        inParagraph = false;
+        try {
+            handler.endElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a paragraph", e);
+        }
+    }
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        try {
+            handler.characters(text);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a string: " + text, e);
+        }
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            handler.characters(text.getCharacter());
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a character: " + text.getCharacter(), e);
+        }
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        try {
+            handler.characters(" ");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a space character", e);
+        }
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        try {
+            handler.characters("\n");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a newline character", e);
+        }
+    }
+
+}

Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Tika parser for PDF documents.
+ * <p>
+ * This parser can also process encrypted PDF documents if the required
+ * password is given as a part of the input metadata associated with a
+ * document. If no password is given, then this parser will try decrypting
+ * the document using the empty password that's often used with PDFs.
+ *
+ * @since Apache PDFBox 1.7.0
+ */
+public class PDFParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -752276948656079347L;
+
+    /**
+     * Metadata key for giving the document password to the parser.
+     */
+    public static final String PASSWORD = "org.apache.pdfbox.tika.password";
+
+    /**
+     * Legacy Tika metadata key for giving the document password to the parser.
+     *
+     * @since Apache Tika 0.5
+     */
+    private static final String OLD_PASSWORD =
+            "org.apache.tika.parser.pdf.password";
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("pdf"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        PDDocument pdfDocument =
+            PDDocument.load(new CloseShieldInputStream(stream), true);
+        try {
+            if (pdfDocument.isEncrypted()) {
+                try {
+                    String password = metadata.get(PASSWORD);
+                    if (password == null) {
+                        password = metadata.get(OLD_PASSWORD);
+                    }
+                    if (password == null) {
+                        password = "";
+                    }
+                    pdfDocument.decrypt(password);
+                } catch (Exception e) {
+                    // Ignore
+                }
+            }
+            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata);
+            PDF2XHTML.process(pdfDocument, handler, metadata);
+        } finally {
+            pdfDocument.close();
+        }
+    }
+
+    private void extractMetadata(PDDocument document, Metadata metadata)
+            throws TikaException {
+        PDDocumentInformation info = document.getDocumentInformation();
+        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+        addMetadata(metadata, Metadata.TITLE, info.getTitle());
+        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
+        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
+        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
+        addMetadata(metadata, "producer", info.getProducer());
+        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
+        addMetadata(metadata, "trapped", info.getTrapped());
+        try {
+            addMetadata(metadata, "created", info.getCreationDate());
+            addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+        try {
+            Calendar modified = info.getModificationDate(); 
+            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+
+        // All remaining metadata is custom
+        // Copy this over as-is
+        List<String> handledMetadata = Arrays.asList(
+                "Author", "Creator", "CreationDate", "ModDate",
+                "Keywords", "Producer", "Subject", "Title", "Trapped");
+        for (COSName key : info.getDictionary().keySet()) {
+            String name = key.getName();
+            if (!handledMetadata.contains(name)) {
+                addMetadata(
+                        metadata, name,
+                        info.getDictionary().getDictionaryObject(key));
+            }
+        }
+    }
+
+    private void addMetadata(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    private void addMetadata(Metadata metadata, String name, Calendar value) {
+        if (value != null) {
+            metadata.set(name, value.getTime().toString());
+        }
+    }
+
+    private void addMetadata(
+            Metadata metadata, Property property, Calendar value) {
+        if (value != null) {
+            metadata.set(property, value.getTime());
+        }
+    }
+
+    /**
+     * Used when processing custom metadata entries, as PDFBox won't do
+     * the conversion for us in the way it does for the standard ones
+     */
+    private void addMetadata(Metadata metadata, String name, COSBase value) {
+        if (value instanceof COSArray) {
+            for (COSBase v : ((COSArray)value).toList()) {
+                addMetadata(metadata, name, v);
+            }
+        } else if (value instanceof COSString) {
+            addMetadata(metadata, name, ((COSString) value).getString());
+        } else {
+            addMetadata(metadata, name, value.toString());
+        }
+    }
+}

Added: pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Thu Oct 13 14:39:28 2011
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.pdfbox.tika.PDFParser

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java Thu Oct 13 14:39:28 2011
@@ -31,6 +31,7 @@ import org.apache.pdfbox.pdmodel.TestFDF
 import org.apache.pdfbox.pdmodel.TestPDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.TestPDDocumentInformation;
 import org.apache.pdfbox.pdmodel.interactive.form.TestFields;
+import org.apache.pdfbox.tika.PDFParserTest;
 import org.apache.pdfbox.util.TestDateUtil;
 import org.apache.pdfbox.util.TestMatrix;
 
@@ -90,6 +91,8 @@ public class TestAll extends TestCase
         suite.addTestSuite( TestPackedBitArray.class );
         suite.addTestSuite( TestCCITTFaxG31DDecodeInputStream.class );
 
+        suite.addTestSuite( PDFParserTest.class );
+
         return suite;
     }
 }

Added: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java (added)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.tika;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+/**
+ * Test case for parsing PDF files through Tika's auto-detecting parser.
+ */
+public class PDFParserTest extends TestCase {
+
+    public void testPdfParsing() throws Exception { // Parses testPDF.pdf and checks content type, author, title and body text.
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler(); // its toString() is read back below as the extracted text
+        Metadata metadata = new Metadata(); // passed to parse() and queried afterwards
+        ParseContext context = new ParseContext();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream( // relative name: looked up next to this class
+                "testPDF.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close(); // NOTE(review): stream is null if the resource is missing, so this would throw NullPointerException
+        }
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR));
+        assertEquals("Apache Tika - Apache Tika", metadata.get(Metadata.TITLE));
+        
+        // Dates cannot be tested reliably yet - see TIKA-451; the two assertions below stay disabled until then
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("Tika - Content Analysis Toolkit"));
+        assertTrue(content.contains("incubator"));
+        assertTrue(content.contains("Apache Software Foundation"));
+        // Check that the end of one paragraph is separated from the start of the next one
+        assertTrue("should have word boundary after headline", 
+                !content.contains("ToolkitApache")); // fails if headline and following text were glued together
+        assertTrue("should have word boundary between paragraphs", 
+                !content.contains("libraries.Apache")); // fails if two paragraphs were glued together
+    }
+
+    public void testCustomMetadata() throws Exception { // Parses testPDF-custommetadata.pdf and checks standard and custom metadata keys.
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler(); // its toString() is read back below as the extracted text
+        Metadata metadata = new Metadata(); // passed to parse() and queried afterwards
+        ParseContext context = new ParseContext();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream( // relative name: looked up next to this class
+                "testPDF-custommetadata.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close(); // NOTE(review): stream is null if the resource is missing, so this would throw NullPointerException
+        }
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Document author", metadata.get(Metadata.AUTHOR));
+        assertEquals("Document title", metadata.get(Metadata.TITLE));
+        
+        assertEquals("Custom Value", metadata.get("Custom Property")); // custom keys are looked up by their plain string name
+        assertEquals("Array Entry 1", metadata.get("Custom Array")); // get() is expected to yield the first of the two entries
+        assertEquals(2, metadata.getValues("Custom Array").length); // getValues() is expected to expose both entries, in order
+        assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
+        assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
+        
+        String content = handler.toString();
+        assertTrue(content.contains("Hello World!"));
+    }
+    
+    /**
+     * PDFs can be "protected" with the default password. This means
+     *  they're encrypted (potentially both text and metadata),
+     *  but we can decrypt them easily.
+     */
+    public void testProtectedPDF() throws Exception { // Parses testPDF_protected.pdf and checks that metadata and text are readable.
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler(); // its toString() is read back below as the extracted text
+       Metadata metadata = new Metadata(); // passed to parse() and queried afterwards
+       ParseContext context = new ParseContext(); // empty context: no password or other settings are supplied here
+
+       InputStream stream = PDFParserTest.class.getResourceAsStream( // relative name: looked up next to this class
+               "testPDF_protected.pdf");
+       try {
+           parser.parse(stream, handler, metadata, context);
+       } finally {
+           stream.close(); // NOTE(review): stream is null if the resource is missing, so this would throw NullPointerException
+       }
+
+       assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
+       assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+       assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(Metadata.TITLE));
+
+       String content = handler.toString();
+       assertTrue(content.contains("RETHINKING THE FINANCIAL NETWORK"));
+       assertTrue(content.contains("On 16 November 2002"));
+       assertTrue(content.contains("In many important respects"));
+    }
+
+    public void testTwoTextBoxes() throws Exception { // Parses testPDFTwoTextBoxes.pdf and checks the order of the extracted column text.
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler(); // its toString() is read back below as the extracted text
+        Metadata metadata = new Metadata(); // required by parse(); not inspected in this test
+        ParseContext context = new ParseContext();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream( // relative name: looked up next to this class
+                "testPDFTwoTextBoxes.pdf");
+        try {
+          parser.parse(stream, handler, metadata, context);
+        } finally {
+          stream.close(); // NOTE(review): stream is null if the resource is missing, so this would throw NullPointerException
+        }
+
+        String content = handler.toString();
+        content = content.replaceAll("\\s+"," "); // collapse each whitespace run to one space so line breaks do not matter
+        assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2")); // both left-column lines must precede the right-column lines
+    }
+
+    public void testVarious() throws Exception { // Parses testPDFVarious.pdf and checks many kinds of text plus keywords/subject metadata.
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler(); // its toString() is read back below as the extracted text
+        Metadata metadata = new Metadata(); // passed to parse() and queried afterwards
+        ParseContext context = new ParseContext();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream( // relative name: looked up next to this class
+                "testPDFVarious.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close(); // NOTE(review): stream is null if the resource is missing, so this would throw NullPointerException
+        }
+
+        String content = handler.toString();
+        //content = content.replaceAll("\\s+"," ");
+        assertContains("Footnote appears here", content);
+        assertContains("This is a footnote.", content);
+        assertContains("This is the header text.", content);
+        assertContains("This is the footer text.", content);
+        assertContains("Here is a text box", content);
+        assertContains("Bold", content);
+        assertContains("italic", content);
+        assertContains("underline", content);
+        assertContains("superscript", content);
+        assertContains("subscript", content);
+        assertContains("Here is a citation:", content);
+        assertContains("Figure 1 This is a caption for Figure 1", content);
+        assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," ")); // whitespace collapsed; cells expected row by row
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," ")); // whitespace collapsed; cells expected column by column
+        assertContains("This is a hyperlink", content);
+        assertContains("Here is a list:", content);
+        for(int row=1;row<=3;row++) {
+            //assertContains("·\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("Bullet " + row, content); // only the item text is checked; the stricter bullet-glyph variants above are disabled
+        }
+        assertContains("Here is a numbered list:", content);
+        for(int row=1;row<=3;row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            assertContains(row + ") Number bullet " + row, content); // a space, not the tab of the disabled variant, is expected after ")"
+        }
+
+        for(int row=1;row<=2;row++) {
+            for(int col=1;col<=3;col++) {
+                assertContains("Row " + row + " Col " + col, content); // each of the 2 x 3 table cells individually, on the uncollapsed text
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", content); // the keywords are expected in the body text as well as in the metadata
+        assertEquals("Keyword1 Keyword2",
+                     metadata.get(Metadata.KEYWORDS));
+
+        assertContains("Subject is here", content); // the subject is expected in the body text as well as in the metadata
+        assertEquals("Subject is here",
+                     metadata.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", content);
+        // Fullwidth form of "(GHQ)": fullwidth parentheses and fullwidth Latin capitals
+        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        // 12 further characters of the Japanese sample text
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+        assertContains("And then some Gothic text:", content);
+        // TODO: The Word document was saved as a PDF, but that
+        // process apparently lost the Gothic characters,
+        // so they cannot be tested here:
+        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+    }
+
+    // TIKA-738: re-enable this
+    public void IGNOREtestAnnotations() throws Exception { // Disabled: the name does not start with "test", so JUnit 3 will not run it.
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler(); // its toString() is read back below as the extracted text
+        Metadata metadata = new Metadata(); // required by parse(); not inspected in this test
+        ParseContext context = new ParseContext();
+        InputStream stream = PDFParserTest.class.getResourceAsStream( // relative name: looked up next to this class
+                "testAnnotations.pdf");
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close(); // NOTE(review): stream is null if the resource is missing, so this would throw NullPointerException
+        }
+        String content = handler.toString();
+        content = content.replaceAll("[\\s\u00a0]+"," "); // collapse runs of whitespace and no-break spaces (U+00A0) to one space
+        assertContains("Here is some text", content);
+        assertContains("Here is a comment", content);
+    }
+
+    public void testPageNumber() throws Exception { // Checks that the XML produced for testPageNumber.pdf contains a paragraph holding just "1".
+        String result = getXML("testPageNumber.pdf"); // serialized SAX output of the parse, as a string
+        String content = result.replaceAll("\\s+",""); // drop ALL whitespace so layout inside and around the element cannot break the match
+        assertContains("<p>1</p>", content);
+    }
+
+    private String getXML(String filename) throws Exception { // Parses the named test resource and returns the emitted SAX events serialized as XML text.
+        Metadata metadata = new Metadata(); // required by parse(); not returned to the caller
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        StringWriter sw = new StringWriter(); // receives the serialized XML
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                 SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler(); // used as the ContentHandler passed to parse()
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); // no indentation added by the serializer
+        handler.setResult(new StreamResult(sw));
+
+        // Open the requested test resource; the relative name is looked up next to this class
+        InputStream input = PDFParserTest.class.getResourceAsStream(
+                filename);
+        try {
+            parser.parse(input, handler, metadata, new ParseContext());
+            return sw.toString();
+        } finally {
+            input.close(); // NOTE(review): input is null if the resource is missing, so this would throw NullPointerException
+        }
+    }
+
+    private void assertContains(String needle, String haystack) { // Fails the test unless haystack contains needle as a substring.
+        assertTrue(
+                "\"" + needle + "\" not found in \"" + haystack + "\"", // failure message quotes both strings (the haystack may be long)
+                haystack.contains(needle));
+    }
+
+}