You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/07 19:10:00 UTC
tika git commit: TIKA-1894 - Add XMPMM support to PDFParser and
JpegParser via Jempbox
Repository: tika
Updated Branches:
refs/heads/2.x 35d1b2ad0 -> dc4ca999c
TIKA-1894 - Add XMPMM support to PDFParser and JpegParser via Jempbox
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dc4ca999
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dc4ca999
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dc4ca999
Branch: refs/heads/2.x
Commit: dc4ca999c2855814158868af97e877cbcc74079a
Parents: 35d1b2a
Author: tballison <ta...@mitre.org>
Authored: Mon Mar 7 13:09:47 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Mar 7 13:09:47 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 5 +-
.../java/org/apache/tika/metadata/XMPMM.java | 44 +++++
.../tika-parser-multimedia-bundle/pom.xml | 1 +
.../tika-parser-pdf-bundle/pom.xml | 1 +
tika-parser-modules/pom.xml | 5 +-
.../tika-parser-multimedia-module/pom.xml | 10 +-
.../apache/tika/parser/image/TiffParser.java | 2 +-
.../tika/parser/image/xmp/JempboxExtractor.java | 97 ----------
.../tika/parser/image/xmp/XMPPacketScanner.java | 113 -----------
.../org/apache/tika/parser/jpeg/JpegParser.java | 2 +-
.../parser/image/xmp/JempboxExtractorTest.java | 107 -----------
.../apache/tika/parser/jpeg/JpegParserTest.java | 16 ++
.../tika-parser-pdf-module/pom.xml | 5 +
.../org/apache/tika/parser/pdf/PDFParser.java | 15 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 57 ++++++
.../tika-parser-xmp-module/pom.xml | 52 ++++++
.../tika/module/xmp/internal/Activator.java | 36 ++++
.../tika/parser/xmp/JempboxExtractor.java | 187 +++++++++++++++++++
.../tika/parser/xmp/XMPPacketScanner.java | 113 +++++++++++
.../tika/parser/xmp/JempboxExtractorTest.java | 107 +++++++++++
20 files changed, 643 insertions(+), 332 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e9d696d..d4611f0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -9,6 +9,9 @@ Release 2.0 - Future Development
* (Something about more specific parser bundles, plus an overall one)
Release 1.13 - ???
+
+ * Add XMPMM support to PDFParser and JpegParser via Jempbox (TIKA-1894).
+
* Move serialization of TikaConfig to tika-core and enable dumping
of the config file via tika-app (TIKA-1657).
@@ -16,7 +19,7 @@ Release 1.13 - ???
* Upgrade to sqlite-jdbc 3.8.11.2 (TIKA-1861). NOTE: this dependency
is still <scope>provided</scope>. You need to include this dependency
- in order to parser sqlite files.
+ in order to parse sqlite files.
* Upgrade to POI 3.14-beta1 (TIKA-1799).
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
index 3fc4dfa..1a5ef6d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
@@ -72,4 +72,48 @@ public interface XMPMM {
Property RENDITION_PARAMS = Property.externalText(
PREFIX_ + "RenditionParams");
+ /**
+ * Instance id of each event in the XMPMM's history section (multi-valued)
+ */
+ Property HISTORY_EVENT_INSTANCEID = Property.externalTextBag(
+ PREFIX_+"History:InstanceID"
+ );
+
+ /**
+ * Action of each event in the XMPMM's history section (multi-valued)
+ */
+ Property HISTORY_ACTION = Property.externalTextBag(
+ PREFIX_+"History:Action"
+ );
+ /**
+ * When each action in the XMPMM's history section occurred (multi-valued)
+ */
+ Property HISTORY_WHEN = Property.externalTextBag(
+ PREFIX_+"History:When"
+ );
+
+ /**
+ * Software agent that performed each action in the XMPMM's
+ * history section (multi-valued)
+ */
+ Property HISTORY_SOFTWARE_AGENT = Property.externalTextBag(
+ PREFIX_+"History:SoftwareAgent"
+ );
+
+ /**
+ * Document id of the document that this document
+ * was derived from
+ */
+ Property DERIVED_FROM_DOCUMENTID = Property.externalText(
+ PREFIX_+"DerivedFrom:DocumentID"
+ );
+
+ /**
+ * Instance id of the document instance that this
+ * document was derived from
+ */
+ Property DERIVED_FROM_INSTANCEID = Property.externalText(
+ PREFIX_+"DerivedFrom:InstanceID"
+ );
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
index 85e09f8..7b528bc 100644
--- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
@@ -45,6 +45,7 @@
com.sun.xml.internal.bind.marshaller</_runsystempackages>
<Embed-Dependency>
tika-parser-multimedia-module;inline=true,
+ tika-parser-xmp-module;inline=true,
metadata-extractor;inline=true,
xmpcore;inline=true,
commons-codec;inline=true,
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
index 08cd863..27773a8 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
@@ -47,6 +47,7 @@
<Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
<Embed-Dependency>
tika-parser-pdf-module;inline=true,
+ tika-parser-xmp-module;inline=true,
commons-io;inline=true,
pdfbox;inline=true,
bcmail-jdk15on;inline=true,
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index ce5edd3..8a3435a 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -56,6 +56,7 @@
<module>tika-parser-scientific-module</module>
<module>tika-parser-text-module</module>
<module>tika-parser-web-module</module>
+ <module>tika-parser-xmp-module</module>
</modules>
<dependencies>
@@ -72,7 +73,7 @@
<optional>true</optional>
</dependency>
<!-- Test dependencies -->
- <dependency>
+ <dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
@@ -86,7 +87,7 @@
<type>test-jar</type>
<scope>test</scope>
</dependency>
- <dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index f15f3bd..63ea5aa 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -36,6 +36,11 @@
<version>${project.version}</version>
</dependency>
<dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-xmp-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
<groupId>com.drewnoakes</groupId>
<artifactId>metadata-extractor</artifactId>
<version>${metadata.extractor.version}</version>
@@ -82,11 +87,6 @@
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
- <artifactId>jempbox</artifactId>
- <version>${pdfbox.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>${pdfbox.version}</version>
</dependency>
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
index 3be436b..c98ce69 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
@@ -28,7 +28,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.parser.xmp.JempboxExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
deleted file mode 100644
index 20d3db5..0000000
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image.xmp;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.List;
-
-import org.apache.jempbox.xmp.XMPMetadata;
-import org.apache.jempbox.xmp.XMPSchemaDublinCore;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.xml.sax.InputSource;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-public class JempboxExtractor {
-
- // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
- private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
- private XMPPacketScanner scanner = new XMPPacketScanner();
- private Metadata metadata;
-
- public JempboxExtractor(Metadata metadata) {
- this.metadata = metadata;
- }
-
- public void parse(InputStream file) throws IOException, TikaException {
- ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
- if (!scanner.parse(file, xmpraw)) {
- return;
- }
-
- Reader decoded = new InputStreamReader(
- new ByteArrayInputStream(xmpraw.toByteArray()),
- DEFAULT_XMP_CHARSET);
- try {
- XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
- XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
- if (dc != null) {
- if (dc.getTitle() != null) {
- metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
- }
- if (dc.getDescription() != null) {
- metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
- }
- if (dc.getCreators() != null && dc.getCreators().size() > 0) {
- metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
- }
- if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
- for (String keyword : dc.getSubjects()) {
- metadata.add(TikaCoreProperties.KEYWORDS, keyword);
- }
- // TODO should we set KEYWORDS too?
- // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
- }
- }
- } catch (IOException e) {
- // Could not parse embedded XMP metadata. That's not a serious
- // problem, so we'll just ignore the issue for now.
- // TODO: Make error handling like this configurable.
- }
- }
-
- protected String joinCreators(List<String> creators) {
- if (creators == null || creators.size() == 0) {
- return "";
- }
- if (creators.size() == 1) {
- return creators.get(0);
- }
- StringBuffer c = new StringBuffer();
- for (String s : creators) {
- c.append(", ").append(s);
- }
- return c.substring(2);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
deleted file mode 100644
index d4fa4bc..0000000
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */
-
-package org.apache.tika.parser.image.xmp;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-
-/**
- * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet
- * it finds and parses it.
- * <p/>
- * Important: Before you use this class to look for an XMP packet in some random file, please read
- * the chapter on "Scanning Files for XMP Packets" in the XMP specification!
- * <p/>
- * Thic class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
- * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
- */
-public class XMPPacketScanner {
-
- private static final byte[] PACKET_HEADER;
- private static final byte[] PACKET_HEADER_END;
- private static final byte[] PACKET_TRAILER;
-
- static {
- PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII);
- PACKET_HEADER_END = "?>".getBytes(US_ASCII);
- PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII);
- }
-
- private static boolean skipAfter(InputStream in, byte[] match) throws IOException {
- return skipAfter(in, match, null);
- }
-
- private static boolean skipAfter(InputStream in, byte[] match, OutputStream out)
- throws IOException {
- int found = 0;
- int len = match.length;
- int b;
- while ((b = in.read()) >= 0) {
- if (b == match[found]) {
- found++;
- if (found == len) {
- return true;
- }
- } else {
- if (out != null) {
- if (found > 0) {
- out.write(match, 0, found);
- }
- out.write(b);
- }
- found = 0;
- }
- }
- return false;
- }
-
- /**
- * Locates an XMP packet in a stream, parses it and returns the XMP metadata. If no
- * XMP packet is found until the stream ends, null is returned. Note: This method
- * only finds the first XMP packet in a stream. And it cannot determine whether it
- * has found the right XMP packet if there are multiple packets.
- * <p/>
- * Does <em>not</em> close the stream.
- * If XMP block was found reading can continue below the block.
- *
- * @param in the InputStream to search
- * @param xmlOut to write the XMP packet to
- * @return true if XMP packet is found, false otherwise
- * @throws IOException if an I/O error occurs
- * @throws TransformerException if an error occurs while parsing the XMP packet
- */
- public boolean parse(InputStream in, OutputStream xmlOut) throws IOException {
- if (!in.markSupported()) {
- in = new java.io.BufferedInputStream(in);
- }
- boolean foundXMP = skipAfter(in, PACKET_HEADER);
- if (!foundXMP) {
- return false;
- }
- //TODO Inspect "begin" attribute!
- if (!skipAfter(in, PACKET_HEADER_END)) {
- throw new IOException("Invalid XMP packet header!");
- }
- //TODO Do with TeeInputStream when Commons IO 1.4 is available
- if (!skipAfter(in, PACKET_TRAILER, xmlOut)) {
- throw new IOException("XMP packet not properly terminated!");
- }
- return true;
- }
-
-}
-
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
index d13cd62..247194e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
@@ -29,7 +29,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.image.ImageMetadataExtractor;
-import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.parser.xmp.JempboxExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
deleted file mode 100644
index e389f17..0000000
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.image.xmp;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collection;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class JempboxExtractorTest {
-
- @Test
- public void testParseJpeg() throws IOException, TikaException {
- Metadata metadata = new Metadata();
- InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
- // set some values before extraction to see that they are overridden
- metadata.set(TikaCoreProperties.TITLE, "old title");
- metadata.set(TikaCoreProperties.DESCRIPTION, "old description");
- metadata.set(TikaCoreProperties.CREATOR, "previous author");
- // ... or kept in case the field is multi-value
- metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword");
-
- JempboxExtractor extractor = new JempboxExtractor(metadata);
- extractor.parse(stream);
-
- // DublinCore fields
- assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
- Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
- assertTrue(keywords.contains("oldkeyword"));
- assertTrue(keywords.contains("grazelands"));
- assertTrue(keywords.contains("nature reserve"));
- assertTrue(keywords.contains("bird watching"));
- assertTrue(keywords.contains("coast"));
- Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
- assertTrue(subject.contains("oldkeyword"));
- assertTrue(subject.contains("grazelands"));
- assertTrue(subject.contains("nature reserve"));
- assertTrue(subject.contains("bird watching"));
- assertTrue(subject.contains("coast"));
- }
-
- @Test
- public void testParseJpegPhotoshop() throws IOException, TikaException {
- Metadata metadata = new Metadata();
- InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
-
- JempboxExtractor extractor = new JempboxExtractor(metadata);
- extractor.parse(stream);
-
- // DublinCore fields
- assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
- Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
- assertTrue(keywords.contains("bird watching"));
- assertTrue(keywords.contains("coast"));
- }
-
- @Test
- public void testParseJpegXnviewmp() throws IOException, TikaException {
- Metadata metadata = new Metadata();
- InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
-
- JempboxExtractor extractor = new JempboxExtractor(metadata);
- extractor.parse(stream);
-
- // XnViewMp fields not understood by Jempbox
- assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
- Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
- assertTrue(keywords.contains("coast"));
- assertTrue(keywords.contains("nature reserve"));
- }
-
- @Test
- public void testJoinCreators() {
- assertEquals("Mr B", new JempboxExtractor(null).joinCreators(
- Arrays.asList("Mr B")));
- // TODO use multi-value property instead?
- assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators(
- Arrays.asList("Mr B", "Mr A")));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
index f187545..1f08476 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
@@ -27,6 +27,7 @@ import java.util.List;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.junit.Test;
@@ -247,4 +248,19 @@ public class JpegParserTest {
assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
}
+
+ @Test
+ public void testJPEGXMPMM() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ //TODO: once jempbox is fixed or replaced by xmpbox, add assertions for
+ //the XMPMM history section, which is currently not extracted
+ assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234",
+ metadata.get(XMPMM.DOCUMENTID));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index dfe2f0a..a706ff3 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -34,6 +34,11 @@
<version>${project.version}</version>
</dependency>
<dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-xmp-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 65f0b9c..6fe0396 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -58,6 +58,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.xmp.JempboxExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -223,19 +224,23 @@ public class PDFParser extends AbstractParser {
Boolean.toString(ap.canPrintDegraded()));
- //now go for the XMP stuff
+ //now go for the XMP
org.apache.jempbox.xmp.XMPMetadata xmp = null;
XMPSchemaDublinCore dcSchema = null;
try {
if (document.getDocumentCatalog().getMetadata() != null) {
xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
}
- if (xmp != null) {
+ } catch (IOException e) {}
+
+ if (xmp != null) {
+ try {
dcSchema = xmp.getDublinCoreSchema();
- }
- } catch (IOException e) {
- //swallow
+ } catch (IOException e) {}
+
+ JempboxExtractor.extractXMPMM(xmp, metadata);
}
+
PDDocumentInformation info = document.getDocumentInformation();
metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8005c5b..a8bfaed 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.pdf;
+import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
@@ -31,6 +32,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
+
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
@@ -45,6 +47,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -1357,6 +1360,60 @@ public class PDFParserTest extends TikaTest {
assertNotContained("Mount Rushmore National Memorial", xml);
}
+ @Test
+ public void testXMPMM() throws Exception {
+ //the XMPMM document id should be reported for both test files
+ Metadata m = getXML("testPDF_twoAuthors.pdf").metadata;
+ assertEquals("uuid:0e46913c-72b9-40c0-8232-69e362abcd1e",
+ m.get(XMPMM.DOCUMENTID));
+
+ m = getXML("testPDF_Version.11.x.PDFA-1b.pdf").metadata;
+ assertEquals("uuid:cccee1fc-51b3-4b52-ac86-672af3974d25",
+ m.get(XMPMM.DOCUMENTID));
+
+ //the history section is reported as parallel arrays, one per field;
+ //this file should yield 7 elements in each of them
+ assertArrayEquals(new String[]{
+ "uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf",
+ "uuid:edc4279e-0d5f-465e-b13e-1298402fd11c",
+ "uuid:f565b775-43f3-4a9a-8541-e98c4115db6d",
+ "uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f",
+ "uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa",
+ "uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36",
+ "uuid:c1669773-a6ca-4bdd-aade-519030d0af00"
+ }, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID));
+
+ assertArrayEquals(new String[]{
+ "converted",
+ "converted",
+ "converted",
+ "converted",
+ "converted",
+ "converted",
+ "converted"
+ }, m.getValues(XMPMM.HISTORY_ACTION));
+
+ assertArrayEquals(new String[]{
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight"
+ }, m.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));
+
+ assertArrayEquals(new String[]{
+ "2014-03-04T23:50:41Z",
+ "2014-03-04T23:50:42Z",
+ "2014-03-04T23:51:34Z",
+ "2014-03-04T23:51:36Z",
+ "2014-03-04T23:51:37Z",
+ "2014-03-04T23:52:22Z",
+ "2014-03-04T23:54:48Z"
+ }, m.getValues(XMPMM.HISTORY_WHEN));
+ }
+
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
InputStream is = getResourceAsStream(path);
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/pom.xml b/tika-parser-modules/tika-parser-xmp-module/pom.xml
new file mode 100644
index 0000000..2101075
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-module/pom.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-xmp-module</artifactId>
+ <name>Apache Tika parser xmp module</name>
+ <url>http://tika.apache.org/</url>
+
+ <!-- NOTE(review): mime4j.version is not referenced anywhere else in this POM;
+ presumably a leftover copied from another module. Confirm before removing. -->
+ <properties>
+ <mime4j.version>0.7.2</mime4j.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <!-- pdfbox.version is not defined in this file; presumably it is inherited
+ from the parent POM. TODO confirm. -->
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>jempbox</artifactId>
+ <version>${pdfbox.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <!-- Declared without a version or configuration here; presumably both come
+ from plugin management in a parent POM. TODO confirm. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java
new file mode 100644
index 0000000..4161c6e
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.xmp.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
new file mode 100644
index 0000000..aa72896
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xmp;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Calendar;
+import java.util.List;
+
+import org.apache.jempbox.xmp.ResourceEvent;
+import org.apache.jempbox.xmp.ResourceRef;
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.utils.DateUtils;
+import org.xml.sax.InputSource;
+
+public class JempboxExtractor {
+
+ // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
+ private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
+ private XMPPacketScanner scanner = new XMPPacketScanner();
+ private Metadata metadata;
+
+ public JempboxExtractor(Metadata metadata) {
+ this.metadata = metadata;
+ }
+
+ public void parse(InputStream file) throws IOException, TikaException {
+ ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
+ if (!scanner.parse(file, xmpraw)) {
+ return;
+ }
+
+ Reader decoded = new InputStreamReader(
+ new ByteArrayInputStream(xmpraw.toByteArray()),
+ DEFAULT_XMP_CHARSET);
+ XMPMetadata xmp = null;
+ try {
+ xmp = XMPMetadata.load(new InputSource(decoded));
+ } catch (IOException e) {
+ //
+ }
+
+ if (xmp == null) {
+ return;
+ }
+ XMPSchemaDublinCore dc = null;
+ try {
+ dc = xmp.getDublinCoreSchema();
+ } catch (IOException e) {
+ }
+
+ if (dc != null) {
+ if (dc.getTitle() != null) {
+ metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
+ }
+ if (dc.getDescription() != null) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
+ }
+ if (dc.getCreators() != null && dc.getCreators().size() > 0) {
+ metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
+ }
+ if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
+ for (String keyword : dc.getSubjects()) {
+ metadata.add(TikaCoreProperties.KEYWORDS, keyword);
+ }
+ // TODO should we set KEYWORDS too?
+ // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
+ }
+ }
+ extractXMPMM(xmp, metadata);
+ }
+
+ protected String joinCreators(List<String> creators) {
+ if (creators == null || creators.size() == 0) {
+ return "";
+ }
+ if (creators.size() == 1) {
+ return creators.get(0);
+ }
+ StringBuffer c = new StringBuffer();
+ for (String s : creators) {
+ c.append(", ").append(s);
+ }
+ return c.substring(2);
+ }
+
+ /**
+ * Extracts Media Management metadata from XMP.
+ *
+ * Silently swallows exceptions.
+ * @param xmp
+ * @param metadata
+ */
+ public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
+ XMPSchemaMediaManagement mmSchema = null;
+ try {
+ mmSchema = xmp.getMediaManagementSchema();
+ } catch (IOException e) {
+ //swallow
+ return;
+ }
+ if (mmSchema != null) {
+ addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
+ //not currently supported by JempBox...
+// metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());
+
+ ResourceRef derivedFrom = mmSchema.getDerivedFrom();
+ if (derivedFrom != null) {
+ try {
+ addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID());
+ } catch (NullPointerException e) {}
+
+ try {
+ addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID());
+ } catch (NullPointerException e) {}
+
+ //TODO: not yet supported by XMPBox...extract OriginalDocumentID
+ //in DerivedFrom section
+ }
+ if (mmSchema.getHistory() != null) {
+ for (ResourceEvent stevt : mmSchema.getHistory()) {
+ String instanceId = null;
+ String action = null;
+ Calendar when = null;
+ String softwareAgent = null;
+ try {
+ instanceId = stevt.getInstanceID();
+ action = stevt.getAction();
+ when = stevt.getWhen();
+ softwareAgent = stevt.getSoftwareAgent();
+
+ //instanceid can throw npe; getWhen can throw IOException
+ } catch (NullPointerException|IOException e) {
+ //swallow
+ }
+ if (instanceId != null && instanceId.trim().length() > 0) {
+ //for absent data elements, pass in empty strings so
+ //that parallel arrays will have matching offsets
+ //for absent data
+
+ action = (action == null) ? "" : action;
+ String dateString = (when == null) ? "" : DateUtils.formatDate(when);
+ softwareAgent = (softwareAgent == null) ? "" : softwareAgent;
+
+ metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
+ metadata.add(XMPMM.HISTORY_ACTION, action);
+ metadata.add(XMPMM.HISTORY_WHEN, dateString);
+ metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
+ }
+ }
+ }
+ }
+ }
+
+ private static void addMetadata(Metadata m, Property p, String value) {
+ if (value != null) {
+ m.add(p, value);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
new file mode 100644
index 0000000..70018cd
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */
+
+package org.apache.tika.parser.xmp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet
+ * it finds and parses it.
+ * <p/>
+ * Important: Before you use this class to look for an XMP packet in some random file, please read
+ * the chapter on "Scanning Files for XMP Packets" in the XMP specification!
+ * <p/>
+ * Thic class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
+ * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
+ */
+public class XMPPacketScanner {
+
+ private static final byte[] PACKET_HEADER;
+ private static final byte[] PACKET_HEADER_END;
+ private static final byte[] PACKET_TRAILER;
+
+ static {
+ PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII);
+ PACKET_HEADER_END = "?>".getBytes(US_ASCII);
+ PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII);
+ }
+
+ private static boolean skipAfter(InputStream in, byte[] match) throws IOException {
+ return skipAfter(in, match, null);
+ }
+
+ private static boolean skipAfter(InputStream in, byte[] match, OutputStream out)
+ throws IOException {
+ int found = 0;
+ int len = match.length;
+ int b;
+ while ((b = in.read()) >= 0) {
+ if (b == match[found]) {
+ found++;
+ if (found == len) {
+ return true;
+ }
+ } else {
+ if (out != null) {
+ if (found > 0) {
+ out.write(match, 0, found);
+ }
+ out.write(b);
+ }
+ found = 0;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Locates an XMP packet in a stream, parses it and returns the XMP metadata. If no
+ * XMP packet is found until the stream ends, null is returned. Note: This method
+ * only finds the first XMP packet in a stream. And it cannot determine whether it
+ * has found the right XMP packet if there are multiple packets.
+ * <p/>
+ * Does <em>not</em> close the stream.
+ * If XMP block was found reading can continue below the block.
+ *
+ * @param in the InputStream to search
+ * @param xmlOut to write the XMP packet to
+ * @return true if XMP packet is found, false otherwise
+ * @throws IOException if an I/O error occurs
+ * @throws TransformerException if an error occurs while parsing the XMP packet
+ */
+ public boolean parse(InputStream in, OutputStream xmlOut) throws IOException {
+ if (!in.markSupported()) {
+ in = new java.io.BufferedInputStream(in);
+ }
+ boolean foundXMP = skipAfter(in, PACKET_HEADER);
+ if (!foundXMP) {
+ return false;
+ }
+ //TODO Inspect "begin" attribute!
+ if (!skipAfter(in, PACKET_HEADER_END)) {
+ throw new IOException("Invalid XMP packet header!");
+ }
+ //TODO Do with TeeInputStream when Commons IO 1.4 is available
+ if (!skipAfter(in, PACKET_TRAILER, xmlOut)) {
+ throw new IOException("XMP packet not properly terminated!");
+ }
+ return true;
+ }
+
+}
+
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
new file mode 100644
index 0000000..849fd01
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xmp;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+/**
+ * Tests XMP extraction by {@link JempboxExtractor} against JPEG test
+ * documents written by different tools.
+ */
+public class JempboxExtractorTest {
+
+    /**
+     * Runs the extractor over a classpath resource, writing into the given
+     * metadata object, and closes the resource stream afterwards.
+     */
+    private void extract(String resourcePath, Metadata metadata)
+            throws IOException, TikaException {
+        try (InputStream stream = getClass().getResourceAsStream(resourcePath)) {
+            new JempboxExtractor(metadata).parse(stream);
+        }
+    }
+
+    @Test
+    public void testParseJpeg() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        // set some values before extraction to see that they are overridden
+        metadata.set(TikaCoreProperties.TITLE, "old title");
+        metadata.set(TikaCoreProperties.DESCRIPTION, "old description");
+        metadata.set(TikaCoreProperties.CREATOR, "previous author");
+        // ... or kept in case the field is multi-value
+        metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword");
+
+        extract("/test-documents/testJPEG_commented.jpg", metadata);
+
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+        Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue(keywords.contains("oldkeyword"));
+        assertTrue(keywords.contains("grazelands"));
+        assertTrue(keywords.contains("nature reserve"));
+        assertTrue(keywords.contains("bird watching"));
+        assertTrue(keywords.contains("coast"));
+        Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue(subject.contains("oldkeyword"));
+        assertTrue(subject.contains("grazelands"));
+        assertTrue(subject.contains("nature reserve"));
+        assertTrue(subject.contains("bird watching"));
+        assertTrue(subject.contains("coast"));
+    }
+
+    @Test
+    public void testParseJpegPhotoshop() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+
+        extract("/test-documents/testJPEG_commented_pspcs2mac.jpg", metadata);
+
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+        Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue(keywords.contains("bird watching"));
+        assertTrue(keywords.contains("coast"));
+    }
+
+    @Test
+    public void testParseJpegXnviewmp() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+
+        extract("/test-documents/testJPEG_commented_xnviewmp026.jpg", metadata);
+
+        // XnViewMp fields not understood by Jempbox
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue(keywords.contains("coast"));
+        assertTrue(keywords.contains("nature reserve"));
+    }
+
+    @Test
+    public void testJoinCreators() {
+        // joinCreators does not touch the metadata object, so null is fine here
+        assertEquals("Mr B", new JempboxExtractor(null).joinCreators(
+                Arrays.asList("Mr B")));
+        // TODO use multi-value property instead?
+        assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators(
+                Arrays.asList("Mr B", "Mr A")));
+    }
+
+}