You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/06/30 13:52:13 UTC
svn commit: r959275 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDFParser.java
test/java/org/apache/tika/parser/pdf/
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: nick
Date: Wed Jun 30 11:52:12 2010
New Revision: 959275
URL: http://svn.apache.org/viewvc?rev=959275&view=rev
Log:
TIKA-452 - Extract custom pdf metadata
Updates the PDF Parser to extract custom metadata. Also includes a PDFParser unit test (previously missing)
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=959275&r1=959274&r2=959275&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Wed Jun 30 11:52:12 2010
@@ -18,10 +18,16 @@ package org.apache.tika.parser.pdf;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
+import java.util.List;
import java.util.Set;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.tika.exception.TikaException;
@@ -103,6 +109,7 @@ public class PDFParser implements Parser
addMetadata(metadata, "trapped", info.getTrapped());
try {
addMetadata(metadata, "created", info.getCreationDate());
+ addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
} catch (IOException e) {
// Invalid date format, just ignore
}
@@ -112,6 +119,19 @@ public class PDFParser implements Parser
} catch (IOException e) {
// Invalid date format, just ignore
}
+
+ // All remaining metadata is custom: copy it over as-is.
+ // The names below are the standard Info dictionary keys that must not
+ // be copied again as custom entries. They have to match the PDF key
+ // names exactly ("CreationDate", not "CreationData"), otherwise the
+ // entry is treated as custom metadata and added as a raw value.
+ List<String> handledMetadata = Arrays.asList(new String[] {
+     "Author", "Creator", "CreationDate", "ModDate",
+     "Keywords", "Producer", "Subject", "Title", "Trapped"
+ });
+ for(COSName key : info.getDictionary().keySet()) {
+     String name = key.getName();
+     if(! handledMetadata.contains(name)) {
+         addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
+     }
+ }
}
private void addMetadata(Metadata metadata, String name, String value) {
@@ -126,4 +146,19 @@ public class PDFParser implements Parser
}
}
+ /**
+  * Used when processing custom metadata entries, as PDFBox won't do
+  * the conversion for us in the way it does for the standard ones.
+  * <p>
+  * Arrays are flattened, with every element added under the same name;
+  * strings are added using their string value; any other non-null
+  * object is added using its toString() form. A null value is skipped,
+  * since there is nothing to add and calling toString() on it would
+  * throw a NullPointerException.
+  *
+  * @param metadata the metadata object to add the value(s) to
+  * @param name the metadata name to add the value(s) under
+  * @param value the raw PDF object to convert, may be null
+  */
+ private void addMetadata(Metadata metadata, String name, COSBase value) {
+     if(value instanceof COSArray) {
+         // Recurse so that nested arrays and null elements are handled too
+         for(COSBase v : ((COSArray)value).toList()) {
+             addMetadata(metadata, name, v);
+         }
+     } else if(value instanceof COSString) {
+         addMetadata(metadata, name, ((COSString)value).getString());
+     } else if(value != null) {
+         addMetadata(metadata, name, value.toString());
+     }
+ }
}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=959275&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Wed Jun 30 11:52:12 2010
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing pdf files.
+ */
+public class PDFParserTest extends TestCase {
+
+    /**
+     * Parses the sample PDF through the auto-detecting parser and checks
+     * the reported content type, the author and title metadata, and a few
+     * phrases from the extracted text.
+     */
+    public void testPdfParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        InputStream stream = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF.pdf");
+        // getResourceAsStream returns null for a missing resource; fail with
+        // a clear message instead of a NullPointerException from close()
+        assertNotNull("Test document /test-documents/testPDF.pdf not found", stream);
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Bertrand Delacrétaz", metadata.get(Metadata.AUTHOR));
+        assertEquals("Apache Tika - Apache Tika", metadata.get(Metadata.TITLE));
+
+        // Can't reliably test dates yet - see TIKA-451
+//      assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
+//      assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("Tika - Content Analysis Toolkit"));
+        assertTrue(content.contains("incubator"));
+        assertTrue(content.contains("Apache Software Foundation"));
+    }
+}