You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/10/18 21:11:48 UTC
svn commit: r1185805 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika-core/src/main/java/org/apache/tika/metadata/Property.java
tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
Author: jukka
Date: Tue Oct 18 19:11:47 2011
New Revision: 1185805
URL: http://svn.apache.org/viewvc?rev=1185805&view=rev
Log:
TIKA-756: XMP output from Tika CLI
First draft of XMP metadata output. WIP...
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1185805&r1=1185804&r2=1185805&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Oct 18 19:11:47 2011
@@ -75,6 +75,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XMPContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -192,14 +193,32 @@ public class TikaCLI {
};
private final OutputType JSON = new OutputType() {
- @Override
- protected ContentHandler getContentHandler(OutputStream output)
- throws Exception {
- final PrintWriter writer =
- new PrintWriter(getOutputWriter(output, encoding));
- return new NoDocumentJSONMetHandler(writer);
- }
- };
+ @Override
+ protected ContentHandler getContentHandler(OutputStream output)
+ throws Exception {
+ final PrintWriter writer =
+ new PrintWriter(getOutputWriter(output, encoding));
+ return new NoDocumentJSONMetHandler(writer);
+ }
+ };
+
+    /**
+     * Output type that writes metadata as XMP. The returned handler does not
+     * override any content callbacks; when the document ends it feeds the
+     * <code>metadata</code> field of the enclosing class through an
+     * {@link XMPContentHandler} into an XML serializer.
+     */
+    private final OutputType XMP = new OutputType() {
+        @Override
+        protected ContentHandler getContentHandler(OutputStream output)
+                throws Exception {
+            // Serializer that receives the generated XMP SAX events
+            final ContentHandler serializer =
+                getTransformerHandler(output, "xml", encoding, prettyPrint);
+            return new DefaultHandler() {
+                @Override
+                public void endDocument() throws SAXException {
+                    // The XMP packet is only written once parsing has ended
+                    XMPContentHandler writer =
+                        new XMPContentHandler(serializer);
+                    writer.startDocument();
+                    writer.metadata(metadata);
+                    writer.endDocument();
+                }
+            };
+        }
+    };
private final OutputType LANGUAGE = new OutputType() {
@Override
@@ -308,7 +327,9 @@ public class TikaCLI {
} else if (arg.startsWith("--encoding=")) {
encoding = arg.substring("--encoding=".length());
} else if (arg.equals("-j") || arg.equals("--json")) {
- type = JSON;
+ type = JSON;
+ } else if (arg.equals("-y") || arg.equals("--xmp")) {
+ type = XMP;
} else if (arg.equals("-x") || arg.equals("--xml")) {
type = XML;
} else if (arg.equals("-h") || arg.equals("--html")) {
@@ -386,10 +407,11 @@ public class TikaCLI {
out.println();
out.println(" -x or --xml Output XHTML content (default)");
out.println(" -h or --html Output HTML content");
- out.println(" -j or --json Output JSON content");
out.println(" -t or --text Output plain text content");
out.println(" -T or --text-main Output plain text content (main content only)");
out.println(" -m or --metadata Output only metadata");
+ out.println(" -j or --json Output metadata in JSON");
+ out.println(" -y or --xmp Output metadata in XMP");
out.println(" -l or --language Output only language");
out.println(" -d or --detect Detect document type");
out.println(" -eX or --encoding=X Use output encoding X");
@@ -430,7 +452,7 @@ public class TikaCLI {
out.println();
out.println("- Server mode");
out.println();
- out.println(" Use the \"-server\" (or \"-s\") option to start the");
+ out.println(" Use the \"--server\" (or \"-s\") option to start the");
out.println(" Apache Tika server. The server will listen to the");
out.println(" ports you specify as one or more arguments.");
out.println();
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java?rev=1185805&r1=1185804&r2=1185805&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java Tue Oct 18 19:11:47 2011
@@ -18,8 +18,12 @@ package org.apache.tika.metadata;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
/**
* XMP property definition. Each instance of this class defines a single
@@ -30,7 +34,7 @@ import java.util.Set;
*
* @since Apache Tika 0.7
*/
-public final class Property {
+public final class Property implements Comparable<Property> {
public static enum PropertyType {
SIMPLE, STRUCTURE, BAG, SEQ, ALT
@@ -41,6 +45,9 @@ public final class Property {
MIME_TYPE, PROPER_NAME, RATIONAL, REAL, TEXT, URI, URL, XPATH
}
+ private static final Map<String, Property> properties =
+ new HashMap<String, Property>();
+
private final String name;
private final boolean internal;
@@ -67,6 +74,10 @@ public final class Property {
} else {
this.choices = null;
}
+
+ synchronized (properties) {
+ properties.put(name, this);
+ }
}
private Property(
@@ -116,6 +127,19 @@ public final class Property {
return choices;
}
+    /**
+     * Returns all properties created so far whose name starts with the
+     * given prefix followed by a colon.
+     *
+     * @param prefix namespace prefix without the trailing colon
+     * @return matching properties in their natural order, possibly empty
+     */
+    public static SortedSet<Property> getProperties(String prefix) {
+        SortedSet<Property> set = new TreeSet<Property>();
+        String p = prefix + ":";
+        synchronized (properties) {
+            // Iterate entries to avoid a second map lookup per matching key
+            for (Map.Entry<String, Property> entry : properties.entrySet()) {
+                if (entry.getKey().startsWith(p)) {
+                    set.add(entry.getValue());
+                }
+            }
+        }
+        return set;
+    }
+
public static Property internalBoolean(String name) {
return new Property(name, true, ValueType.BOOLEAN);
}
@@ -182,4 +206,20 @@ public final class Property {
return new Property(name, false, ValueType.TEXT);
}
+ //----------------------------------------------------------< Comparable >
+
+    /**
+     * Orders properties by their name.
+     *
+     * @param o the property to compare with
+     * @return the result of comparing the two property names
+     */
+    public int compareTo(Property o) {
+        final String otherName = o.name;
+        return name.compareTo(otherName);
+    }
+
+ //--------------------------------------------------------------< Object >
+
+    /**
+     * Two properties are equal when they have the same name.
+     *
+     * @param o the object to compare with
+     * @return <code>true</code> if the given object is a property with
+     *         the same name, <code>false</code> otherwise
+     */
+    @Override
+    public boolean equals(Object o) {
+        return o instanceof Property && name.equals(((Property) o).name);
+    }
+
+    /**
+     * Returns the hash code of the property name.
+     *
+     * @return hash code of the name
+     */
+    @Override
+    public int hashCode() {
+        return name.hashCode();
+    }
+
}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java?rev=1185805&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java Tue Oct 18 19:11:47 2011
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that simplifies the task of producing XMP output.
+ *
+ * @since Apache Tika 1.0
+ */
+public class XMPContentHandler extends SafeContentHandler {
+
+    /**
+     * The RDF namespace URI
+     */
+    public static final String RDF =
+            "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+    /**
+     * The XMP namespace URI
+     */
+    public static final String XMP =
+            "http://ns.adobe.com/xap/1.0/";
+
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    public XMPContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Starts an XMP document by setting up the namespace mappings and
+     * writing out the following header:
+     * <pre>
+     * &lt;rdf:RDF&gt;
+     * </pre>
+     */
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+
+        startPrefixMapping("rdf", RDF);
+        startPrefixMapping("xmp", XMP);
+
+        startElement(RDF, "RDF", "rdf:RDF", EMPTY_ATTRIBUTES);
+    }
+
+    /**
+     * Ends the XMP document by writing the following footer and
+     * clearing the namespace mappings:
+     * <pre>
+     * &lt;/rdf:RDF&gt;
+     * </pre>
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        endElement(RDF, "RDF", "rdf:RDF");
+
+        endPrefixMapping("xmp");
+        endPrefixMapping("rdf");
+
+        super.endDocument();
+    }
+
+    //------------------------------------------< public convenience methods >
+
+    /** Namespace prefix of the currently open description, or null. */
+    private String prefix = null;
+
+    /** Namespace URI of the currently open description, or null. */
+    private String uri = null;
+
+    /**
+     * Opens an <code>rdf:Description</code> element and maps the given
+     * namespace prefix for the properties written inside it.
+     *
+     * @param about value of the <code>rdf:about</code> attribute
+     * @param prefix namespace prefix used for the contained properties
+     * @param uri namespace URI that the prefix is mapped to
+     * @throws SAXException if the SAX events could not be written
+     */
+    public void startDescription(String about, String prefix, String uri)
+            throws SAXException {
+        this.prefix = prefix;
+        this.uri = uri;
+
+        startPrefixMapping(prefix, uri);
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute(RDF, "about", "rdf:about", "CDATA", about);
+        startElement(RDF, "Description", "rdf:Description", attributes);
+    }
+
+    /**
+     * Closes the <code>rdf:Description</code> element opened by
+     * {@link #startDescription(String, String, String)} and ends the
+     * prefix mapping that was started with it.
+     *
+     * @throws SAXException if the SAX events could not be written
+     */
+    public void endDescription() throws SAXException {
+        endElement(RDF, "Description", "rdf:Description");
+        endPrefixMapping(prefix);
+
+        this.uri = null;
+        this.prefix = null;
+    }
+
+    /**
+     * Writes a single property element in the namespace of the currently
+     * open description, with the given value as its character content.
+     *
+     * @param name local name of the property, without the prefix
+     * @param value property value, must not be <code>null</code>
+     * @throws SAXException if the SAX events could not be written
+     */
+    public void property(String name, String value) throws SAXException {
+        String qname = prefix + ":" + name;
+        startElement(uri, name, qname, EMPTY_ATTRIBUTES);
+        characters(value.toCharArray(), 0, value.length());
+        endElement(uri, name, qname);
+    }
+
+    /**
+     * Writes the given metadata as a sequence of descriptions, one for each
+     * of the namespaces listed below that has at least one property with
+     * a value in the metadata.
+     *
+     * @param metadata the metadata to write
+     * @throws SAXException if the SAX events could not be written
+     */
+    public void metadata(Metadata metadata) throws SAXException {
+        description(metadata, "xmp", XMP);
+        description(metadata, "dc", "http://purl.org/dc/elements/1.1/");
+        description(metadata, "xmpTPg", "http://ns.adobe.com/xap/1.0/t/pg/");
+        // Prefix was misspelled as "xmpRigths", so it could never match
+        description(metadata, "xmpRights", "http://ns.adobe.com/xap/1.0/rights/");
+        description(metadata, "xmpMM", "http://ns.adobe.com/xap/1.0/mm/");
+        description(metadata, "xmpidq", "http://ns.adobe.com/xmp/identifier/qual/1.0/");
+        description(metadata, "xmpBJ", "http://ns.adobe.com/xap/1.0/bj/");
+        description(metadata, "xmpDM", "http://ns.adobe.com/xmp/1.0/DynamicMedia/");
+        description(metadata, "pdf", "http://ns.adobe.com/pdf/1.3/");
+        // Stray "s " removed from the start of the namespace URI
+        description(metadata, "photoshop", "http://ns.adobe.com/photoshop/1.0/");
+        description(metadata, "crs", "http://ns.adobe.com/camera-raw-settings/1.0/");
+        description(metadata, "tiff", "http://ns.adobe.com/tiff/1.0/");
+        description(metadata, "exif", "http://ns.adobe.com/exif/1.0/");
+        description(metadata, "aux", "http://ns.adobe.com/exif/1.0/aux/");
+    }
+
+    /**
+     * Writes one description containing every property with the given
+     * prefix that has a value in the metadata. Nothing is written when
+     * no such property has a value.
+     */
+    private void description(Metadata metadata, String prefix, String uri)
+            throws SAXException {
+        int count = 0;
+        for (Property property : Property.getProperties(prefix)) {
+            String value = metadata.get(property);
+            if (value != null) {
+                // Open the description lazily, on the first value found
+                if (count++ == 0) {
+                    startDescription("", prefix, uri);
+                }
+                // Strip the "prefix:" part to get the local property name
+                property(property.getName().substring(prefix.length() + 1), value);
+            }
+        }
+
+        if (count > 0) {
+            endDescription();
+        }
+    }
+
+}