You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ju...@apache.org on 2011/10/13 16:39:29 UTC
svn commit: r1182880 - in /pdfbox/trunk: fontbox/
fontbox/src/main/java/org/apache/fontbox/tika/ fontbox/src/main/resources/
fontbox/src/main/resources/META-INF/
fontbox/src/main/resources/META-INF/services/
fontbox/src/test/java/org/apache/fontbox/tik...
Author: jukka
Date: Thu Oct 13 14:39:28 2011
New Revision: 1182880
URL: http://svn.apache.org/viewvc?rev=1182880&view=rev
Log:
PDFBOX-1132: Add Tika parser classes
Copy PDF and TTF parser classes and related test cases from Tika.
Added:
pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/
pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java
pdfbox/trunk/fontbox/src/main/resources/
pdfbox/trunk/fontbox/src/main/resources/META-INF/
pdfbox/trunk/fontbox/src/main/resources/META-INF/services/
pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/
pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
pdfbox/trunk/pdfbox/src/main/resources/META-INF/
pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/
pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java
Modified:
pdfbox/trunk/fontbox/pom.xml
pdfbox/trunk/parent/pom.xml
pdfbox/trunk/pdfbox/pom.xml
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
Modified: pdfbox/trunk/fontbox/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/pom.xml?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/fontbox/pom.xml (original)
+++ pdfbox/trunk/fontbox/pom.xml Thu Oct 13 14:39:28 2011
@@ -39,6 +39,11 @@
<dependencies>
<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
@@ -51,7 +56,6 @@
<plugin>
<groupId>org.apache.felix</groupId>
<artifactId>maven-bundle-plugin</artifactId>
- <version>2.0.0</version>
<extensions>true</extensions>
</plugin>
</plugins>
Added: pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java (added)
+++ pdfbox/trunk/fontbox/src/main/java/org/apache/fontbox/tika/TrueTypeParser.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.fontbox.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.fontbox.ttf.TTFParser;
+import org.apache.fontbox.ttf.TrueTypeFont;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Tika parser for TrueType font files (TTF).
+ * <p>
+ * The parser records the font media type together with the creation and
+ * modification timestamps found in the font header, and then emits an
+ * empty XHTML document.
+ *
+ * @since Apache Fontbox 1.7.0
+ */
+public class TrueTypeParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 7276565828404664974L;
+
+    /** Media type reported for parsed fonts. */
+    private static final MediaType TYPE =
+        MediaType.application("x-font-ttf");
+
+    /** The only media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(TYPE);
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context parse context (not consulted)
+     * @return singleton set holding the TrueType media type
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a TrueType font and copies its header dates into the metadata.
+     *
+     * @param stream font data; parsed directly from the backing file when
+     *               the stream is a file-backed {@link TikaInputStream}
+     * @param handler receiver of the (empty) XHTML document
+     * @param metadata receives the content type and the font dates
+     * @param context parse context (not consulted)
+     * @throws IOException if the font can not be read or closed
+     * @throws SAXException if the content handler rejects a SAX event
+     * @throws TikaException declared by the parser contract
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TrueTypeFont font;
+        TTFParser parser = new TTFParser();
+        TikaInputStream tis = TikaInputStream.cast(stream);
+        if (tis != null && tis.hasFile()) {
+            font = parser.parseTTF(tis.getFile());
+        } else {
+            font = parser.parseTTF(stream);
+        }
+
+        try {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            metadata.set(DublinCore.DATE, font.getHeader().getCreated().getTime());
+            metadata.set(
+                    Property.internalDate(DublinCore.MODIFIED),
+                    font.getHeader().getModified().getTime());
+        } finally {
+            // The parsed font keeps its underlying data source open until
+            // it is closed explicitly, so always release it here.
+            font.close();
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+}
Added: pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ pdfbox/trunk/fontbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Thu Oct 13 14:39:28 2011
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.fontbox.tika.TrueTypeParser
Added: pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java (added)
+++ pdfbox/trunk/fontbox/src/test/java/org/apache/fontbox/tika/TrueTypeParserTest.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.fontbox.tika;
+
+import java.io.BufferedInputStream;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+
+public class TrueTypeParserTest extends TestCase {
+
+ public void testTrueTypeParsing() throws Exception {
+ Tika tika = new Tika();
+ String type = "application/x-font-ttf";
+
+ Metadata metadata = new Metadata();
+ InputStream stream = new BufferedInputStream(
+ TrueTypeParserTest.class.getResourceAsStream(
+ "testTrueType.ttf"));
+ assertEquals(type, tika.detect(stream));
+ assertEquals("", tika.parseToString(stream, metadata));
+ assertEquals("1903-12-31T23:00:00Z", metadata.get(Metadata.DATE));
+ assertEquals("1903-12-31T23:00:00Z", metadata.get(Metadata.MODIFIED));
+ assertEquals(type, metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+}
Modified: pdfbox/trunk/parent/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/parent/pom.xml?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/parent/pom.xml (original)
+++ pdfbox/trunk/parent/pom.xml Thu Oct 13 14:39:28 2011
@@ -108,11 +108,26 @@
</excludes>
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <version>2.3.4</version>
+ </plugin>
</plugins>
</pluginManagement>
</build>
- <!-- Developers listed by PMC Chair, PMC all alphabetical-->
+ <dependencyManagement>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>0.10</version>
+ </dependency>
+ </dependencies>
+ </dependencyManagement>
+
+ <!-- Developers listed by PMC Chair, PMC all alphabetical-->
<developers>
<developer>
<id>lehmi</id>
Modified: pdfbox/trunk/pdfbox/pom.xml
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/pom.xml?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/pom.xml (original)
+++ pdfbox/trunk/pdfbox/pom.xml Thu Oct 13 14:39:28 2011
@@ -71,6 +71,11 @@
<optional>true</optional>
</dependency>
<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
@@ -129,7 +134,6 @@
<plugin>
<groupId>org.apache.felix</groupId>
<artifactId>maven-bundle-plugin</artifactId>
- <version>2.0.1</version>
<extensions>true</extensions>
<configuration>
<instructions>
Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDF2XHTML.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.tika;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.util.PDFTextStripper;
+import org.apache.pdfbox.util.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce a semi-structured XHTML SAX events instead of a plain text
+ * stream.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+ // TODO: remove once PDFBOX-1130 is fixed:
+ private boolean inParagraph = false;
+
+ /**
+ * Converts the given PDF document (and related metadata) to a stream
+ * of XHTML SAX events sent to the given content handler.
+ *
+ * @param document PDF document
+ * @param handler SAX content handler
+ * @param metadata PDF metadata
+ * @throws SAXException if the content handler fails to process SAX events
+ * @throws TikaException if the PDF document can not be processed
+ */
+ public static void process(
+ PDDocument document, ContentHandler handler, Metadata metadata)
+ throws SAXException, TikaException {
+ try {
+ // Extract text using a dummy Writer as we override the
+ // key methods to output to the given content handler.
+ new PDF2XHTML(handler, metadata).writeText(document, new Writer() {
+ @Override
+ public void write(char[] cbuf, int off, int len) {
+ }
+ @Override
+ public void flush() {
+ }
+ @Override
+ public void close() {
+ }
+ });
+ } catch (IOException e) {
+ if (e.getCause() instanceof SAXException) {
+ throw (SAXException) e.getCause();
+ } else {
+ throw new TikaException("Unable to extract PDF content", e);
+ }
+ }
+ }
+
+ private final XHTMLContentHandler handler;
+
+ private PDF2XHTML(ContentHandler handler, Metadata metadata)
+ throws IOException {
+ this.handler = new XHTMLContentHandler(handler, metadata);
+ setForceParsing(true);
+ setSortByPosition(false);
+ }
+
+ @Override
+ protected void startDocument(PDDocument pdf) throws IOException {
+ try {
+ handler.startDocument();
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to start a document", e);
+ }
+ }
+
+ @Override
+ protected void endDocument(PDDocument pdf) throws IOException {
+ try {
+ handler.endDocument();
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to end a document", e);
+ }
+ }
+
+ @Override
+ protected void startPage(PDPage page) throws IOException {
+ try {
+ handler.startElement("div", "class", "page");
+ handler.startElement("p");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to start a page", e);
+ }
+ }
+
+ @Override
+ protected void endPage(PDPage page) throws IOException {
+ try {
+ handler.endElement("p");
+ handler.endElement("div");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to end a page", e);
+ }
+ }
+
+ @Override
+ protected void writeParagraphStart() throws IOException {
+ // TODO: remove once PDFBOX-1130 is fixed
+ if (inParagraph) {
+ // Close last paragraph
+ writeParagraphEnd();
+ }
+ assert !inParagraph;
+ inParagraph = true;
+ try {
+ handler.startElement("p");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to start a paragraph", e);
+ }
+ }
+
+ @Override
+ protected void writeParagraphEnd() throws IOException {
+ // TODO: remove once PDFBOX-1130 is fixed
+ if (!inParagraph) {
+ writeParagraphStart();
+ }
+ assert inParagraph;
+ inParagraph = false;
+ try {
+ handler.endElement("p");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("Unable to end a paragraph", e);
+ }
+ }
+
+ @Override
+ protected void writeString(String text) throws IOException {
+ try {
+ handler.characters(text);
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause(
+ "Unable to write a string: " + text, e);
+ }
+ }
+
+ @Override
+ protected void writeCharacters(TextPosition text) throws IOException {
+ try {
+ handler.characters(text.getCharacter());
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause(
+ "Unable to write a character: " + text.getCharacter(), e);
+ }
+ }
+
+ @Override
+ protected void writeWordSeparator() throws IOException {
+ try {
+ handler.characters(" ");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause(
+ "Unable to write a space character", e);
+ }
+ }
+
+ @Override
+ protected void writeLineSeparator() throws IOException {
+ try {
+ handler.characters("\n");
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause(
+ "Unable to write a newline character", e);
+ }
+ }
+
+}
Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java (added)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Tika parser for PDF documents.
+ * <p>
+ * This parser can process also encrypted PDF documents if the required
+ * password is given as a part of the input metadata associated with a
+ * document. If no password is given, then this parser will try decrypting
+ * the document using the empty password that's often used with PDFs.
+ *
+ * @since Apache PDFBox 1.7.0
+ */
+public class PDFParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -752276948656079347L;
+
+    /**
+     * Metadata key for giving the document password to the parser.
+     */
+    public static final String PASSWORD = "org.apache.pdfbox.tika.password";
+
+    /**
+     * Alternative metadata key for the document password; consulted only
+     * when no value is stored under {@link #PASSWORD}.
+     *
+     * @since Apache Tika 0.5
+     */
+    private static final String OLD_PASSWORD =
+        "org.apache.tika.parser.pdf.password";
+
+    /** The only media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("pdf"));
+
+    /**
+     * Document information keys that are copied explicitly; every other
+     * key of the information dictionary is treated as custom metadata.
+     */
+    private static final List<String> HANDLED_METADATA = Arrays.asList(
+            "Author", "Creator", "CreationDate", "ModDate",
+            "Keywords", "Producer", "Subject", "Title", "Trapped");
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context parse context (not consulted)
+     * @return singleton set holding the PDF media type
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Loads the PDF from the given stream, copies the document information
+     * into the metadata and sends the text content to the handler as XHTML.
+     * The stream is shielded from being closed while loading.
+     *
+     * @param stream PDF data
+     * @param handler receiver of the XHTML SAX events
+     * @param metadata supplies an optional password and receives the
+     *                 extracted metadata
+     * @param context parse context (not consulted)
+     * @throws IOException if the document can not be loaded or closed
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        PDDocument pdfDocument =
+            PDDocument.load(new CloseShieldInputStream(stream), true);
+        try {
+            if (pdfDocument.isEncrypted()) {
+                try {
+                    pdfDocument.decrypt(resolvePassword(metadata));
+                } catch (Exception ignored) {
+                    // Deliberate best effort: when decryption fails the
+                    // parse continues with the document as loaded.
+                }
+            }
+            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata);
+            PDF2XHTML.process(pdfDocument, handler, metadata);
+        } finally {
+            pdfDocument.close();
+        }
+    }
+
+    /**
+     * Picks the password to decrypt with: the value stored under
+     * {@link #PASSWORD}, else the one under the older key, else the
+     * empty password.
+     */
+    private String resolvePassword(Metadata metadata) {
+        String password = metadata.get(PASSWORD);
+        if (password == null) {
+            password = metadata.get(OLD_PASSWORD);
+        }
+        return password != null ? password : "";
+    }
+
+    /**
+     * Copies the page count, the standard document information entries
+     * and any custom information entries into the given metadata.
+     */
+    private void extractMetadata(PDDocument document, Metadata metadata)
+            throws TikaException {
+        PDDocumentInformation info = document.getDocumentInformation();
+        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+        addMetadata(metadata, Metadata.TITLE, info.getTitle());
+        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
+        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
+        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
+        addMetadata(metadata, "producer", info.getProducer());
+        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
+        addMetadata(metadata, "trapped", info.getTrapped());
+        try {
+            Calendar created = info.getCreationDate();
+            addMetadata(metadata, "created", created);
+            addMetadata(metadata, Metadata.CREATION_DATE, created);
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+        try {
+            Calendar modified = info.getModificationDate();
+            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+
+        // All remaining metadata is custom
+        // Copy this over as-is
+        for (COSName key : info.getDictionary().keySet()) {
+            String name = key.getName();
+            if (!HANDLED_METADATA.contains(name)) {
+                addMetadata(
+                        metadata, name,
+                        info.getDictionary().getDictionaryObject(key));
+            }
+        }
+    }
+
+    /** Adds a string value under the given name unless it is null. */
+    private void addMetadata(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /** Sets a date, rendered with Date.toString(), unless it is null. */
+    private void addMetadata(Metadata metadata, String name, Calendar value) {
+        if (value != null) {
+            metadata.set(name, value.getTime().toString());
+        }
+    }
+
+    /** Sets a date-valued property unless the date is null. */
+    private void addMetadata(
+            Metadata metadata, Property property, Calendar value) {
+        if (value != null) {
+            metadata.set(property, value.getTime());
+        }
+    }
+
+    /**
+     * Used when processing custom metadata entries, as PDFBox won't do
+     * the conversion for us in the way it does for the standard ones.
+     * Arrays are flattened so that every element is added under the
+     * same name; null values are skipped.
+     */
+    private void addMetadata(Metadata metadata, String name, COSBase value) {
+        if (value == null) {
+            // The dictionary lookup can come back empty (presumably for
+            // a PDF null object or an unresolvable reference); skip the
+            // entry instead of failing with a NullPointerException.
+            return;
+        }
+        if (value instanceof COSArray) {
+            for (COSBase v : ((COSArray)value).toList()) {
+                addMetadata(metadata, name, v);
+            }
+        } else if (value instanceof COSString) {
+            addMetadata(metadata, name, ((COSString) value).getString());
+        } else {
+            addMetadata(metadata, name, value.toString());
+        }
+    }
+}
Added: pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ pdfbox/trunk/pdfbox/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Thu Oct 13 14:39:28 2011
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.pdfbox.tika.PDFParser
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java?rev=1182880&r1=1182879&r2=1182880&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/TestAll.java Thu Oct 13 14:39:28 2011
@@ -31,6 +31,7 @@ import org.apache.pdfbox.pdmodel.TestFDF
import org.apache.pdfbox.pdmodel.TestPDDocumentCatalog;
import org.apache.pdfbox.pdmodel.TestPDDocumentInformation;
import org.apache.pdfbox.pdmodel.interactive.form.TestFields;
+import org.apache.pdfbox.tika.PDFParserTest;
import org.apache.pdfbox.util.TestDateUtil;
import org.apache.pdfbox.util.TestMatrix;
@@ -90,6 +91,8 @@ public class TestAll extends TestCase
suite.addTestSuite( TestPackedBitArray.class );
suite.addTestSuite( TestCCITTFaxG31DDecodeInputStream.class );
+ suite.addTestSuite( PDFParserTest.class );
+
return suite;
}
}
Added: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java?rev=1182880&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java (added)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java Thu Oct 13 14:39:28 2011
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.tika;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+/**
+ * Test case for parsing pdf files.
+ * <p>
+ * Each test reads a PDF resource resolved relative to this class and feeds
+ * it to an {@link AutoDetectParser}.
+ */
+public class PDFParserTest extends TestCase {
+
+    /**
+     * Checks the metadata and body text extracted from testPDF.pdf.
+     */
+    public void testPdfParsing() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseToText("testPDF.pdf", metadata);
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR));
+        assertEquals("Apache Tika - Apache Tika", metadata.get(Metadata.TITLE));
+
+        // Can't reliably test dates yet - see TIKA-451
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
+//        assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
+
+        assertContains("Apache Tika", content);
+        assertContains("Tika - Content Analysis Toolkit", content);
+        assertContains("incubator", content);
+        assertContains("Apache Software Foundation", content);
+        // testing how the end of one paragraph is separated from start of the next one
+        assertFalse("should have word boundary after headline",
+                content.contains("ToolkitApache"));
+        assertFalse("should have word boundary between paragraphs",
+                content.contains("libraries.Apache"));
+    }
+
+    /**
+     * Checks that custom metadata properties, including a multi-valued one,
+     * are reported alongside the standard ones.
+     */
+    public void testCustomMetadata() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseToText("testPDF-custommetadata.pdf", metadata);
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Document author", metadata.get(Metadata.AUTHOR));
+        assertEquals("Document title", metadata.get(Metadata.TITLE));
+
+        assertEquals("Custom Value", metadata.get("Custom Property"));
+        assertEquals("Array Entry 1", metadata.get("Custom Array"));
+        String[] customArray = metadata.getValues("Custom Array");
+        assertEquals(2, customArray.length);
+        assertEquals("Array Entry 1", customArray[0]);
+        assertEquals("Array Entry 2", customArray[1]);
+
+        assertContains("Hello World!", content);
+    }
+
+    /**
+     * PDFs can be "protected" with the default password. This means
+     * they're encrypted (potentially both text and metadata),
+     * but we can decrypt them easily.
+     */
+    public void testProtectedPDF() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseToText("testPDF_protected.pdf", metadata);
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
+        assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
+        assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(Metadata.TITLE));
+
+        assertContains("RETHINKING THE FINANCIAL NETWORK", content);
+        assertContains("On 16 November 2002", content);
+        assertContains("In many important respects", content);
+    }
+
+    /**
+     * Checks that, once whitespace is collapsed, both left-column lines
+     * precede both right-column lines in the extracted text.
+     */
+    public void testTwoTextBoxes() throws Exception {
+        String content = parseToText("testPDFTwoTextBoxes.pdf", new Metadata());
+        content = content.replaceAll("\\s+"," ");
+        assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
+    }
+
+    /**
+     * Checks the text and metadata extracted from testPDFVarious.pdf, a
+     * document mixing many kinds of content (footnotes, tables, lists, ...).
+     */
+    public void testVarious() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseToText("testPDFVarious.pdf", metadata);
+        // Whitespace-collapsed copy, for phrases that span several table cells
+        String collapsed = content.replaceAll("\\s+"," ");
+
+        assertContains("Footnote appears here", content);
+        assertContains("This is a footnote.", content);
+        assertContains("This is the header text.", content);
+        assertContains("This is the footer text.", content);
+        assertContains("Here is a text box", content);
+        assertContains("Bold", content);
+        assertContains("italic", content);
+        assertContains("underline", content);
+        assertContains("superscript", content);
+        assertContains("subscript", content);
+        assertContains("Here is a citation:", content);
+        assertContains("Figure 1 This is a caption for Figure 1", content);
+        assertContains("(Kramer)", content);
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", collapsed);
+        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", collapsed);
+        assertContains("This is a hyperlink", content);
+        assertContains("Here is a list:", content);
+        for(int row=1;row<=3;row++) {
+            //assertContains("·\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("Bullet " + row, content);
+        }
+        assertContains("Here is a numbered list:", content);
+        for(int row=1;row<=3;row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            assertContains(row + ") Number bullet " + row, content);
+        }
+
+        for(int row=1;row<=2;row++) {
+            for(int col=1;col<=3;col++) {
+                assertContains("Row " + row + " Col " + col, content);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", content);
+        assertEquals("Keyword1 Keyword2",
+                metadata.get(Metadata.KEYWORDS));
+
+        assertContains("Subject is here", content);
+        assertEquals("Subject is here",
+                metadata.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", content);
+        // Fullwidth form of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+        // A longer run of Japanese characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);
+
+        assertContains("And then some Gothic text:", content);
+        // TODO: I saved the word doc as a PDF, but that
+        // process somehow, apparently lost the gothic
+        // chars, so we cannot test this here:
+        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+    }
+
+    /**
+     * TIKA-738: re-enable this. Until then the IGNORE prefix keeps the
+     * JUnit 3 runner from picking this method up.
+     */
+    public void IGNOREtestAnnotations() throws Exception {
+        String content = parseToText("testAnnotations.pdf", new Metadata());
+        content = content.replaceAll("[\\s\u00a0]+"," ");
+        assertContains("Here is some text", content);
+        assertContains("Here is a comment", content);
+    }
+
+    /**
+     * Checks that the serialized XML of testPageNumber.pdf contains a
+     * {@code <p>1</p>} element once all whitespace has been stripped.
+     */
+    public void testPageNumber() throws Exception {
+        String result = getXML("testPageNumber.pdf");
+        String content = result.replaceAll("\\s+","");
+        assertContains("<p>1</p>", content);
+    }
+
+    /**
+     * Parses the given test resource and returns the serialized XML output.
+     *
+     * @param filename resource name, resolved relative to this class
+     * @return the XML written by the transformer handler
+     * @throws Exception if the handler cannot be set up or parsing fails
+     */
+    private String getXML(String filename) throws Exception {
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.setResult(new StreamResult(sw));
+
+        parse(filename, handler, new Metadata());
+        return sw.toString();
+    }
+
+    /**
+     * Parses the given test resource and returns its body text.
+     *
+     * @param filename resource name, resolved relative to this class
+     * @param metadata receives the metadata reported by the parser
+     * @return the text collected by a {@link BodyContentHandler}
+     * @throws Exception if parsing fails
+     */
+    private String parseToText(String filename, Metadata metadata)
+            throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        parse(filename, handler, metadata);
+        return handler.toString();
+    }
+
+    /**
+     * Runs an auto-detecting parser over the given test resource. Fails the
+     * test with a clear message if the resource is missing, instead of
+     * letting a null stream surface as a NullPointerException.
+     *
+     * @param filename resource name, resolved relative to this class
+     * @param handler receives the SAX events produced by the parser
+     * @param metadata receives the metadata reported by the parser
+     * @throws Exception if parsing fails
+     */
+    private void parse(
+            String filename, ContentHandler handler, Metadata metadata)
+            throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        InputStream stream = PDFParserTest.class.getResourceAsStream(filename);
+        assertNotNull("Test resource not found: " + filename, stream);
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Asserts that haystack contains needle, reporting both on failure.
+     * @param needle the text that must be present
+     * @param haystack the text to search in
+     */
+    private void assertContains(String needle, String haystack) {
+        assertTrue(
+                "\"" + needle + "\" not found in \"" + haystack + "\"",
+                haystack.contains(needle));
+    }
+
+}