You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/10/18 21:11:48 UTC

svn commit: r1185805 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika-core/src/main/java/org/apache/tika/metadata/Property.java tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java

Author: jukka
Date: Tue Oct 18 19:11:47 2011
New Revision: 1185805

URL: http://svn.apache.org/viewvc?rev=1185805&view=rev
Log:
TIKA-756: XMP output from Tika CLI

First draft of XMP metadata output. WIP...

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1185805&r1=1185804&r2=1185805&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Oct 18 19:11:47 2011
@@ -75,6 +75,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XMPContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
@@ -192,14 +193,32 @@ public class TikaCLI {
     };
 
     private final OutputType JSON = new OutputType() {
-       @Override
-       protected ContentHandler getContentHandler(OutputStream output)
-               throws Exception {
-           final PrintWriter writer =
-               new PrintWriter(getOutputWriter(output, encoding));
-           return new NoDocumentJSONMetHandler(writer);
-       }
-   };
+        @Override
+        protected ContentHandler getContentHandler(OutputStream output)
+                throws Exception {
+            final PrintWriter writer =
+                    new PrintWriter(getOutputWriter(output, encoding));
+            return new NoDocumentJSONMetHandler(writer);
+        }
+    };
+
+    private final OutputType XMP = new OutputType() { // output type selected by -y / --xmp: metadata written as XMP
+        @Override
+        protected ContentHandler getContentHandler(OutputStream output)
+                throws Exception {
+            final ContentHandler handler = // presumably an XML serializer for output; helper is not shown in this hunk
+                    getTransformerHandler(output, "xml", encoding, prettyPrint);
+            return new DefaultHandler() { // only endDocument() is overridden; other SAX events use DefaultHandler's no-op defaults
+                @Override
+                public void endDocument() throws SAXException {
+                    XMPContentHandler xmp = new XMPContentHandler(handler);
+                    xmp.startDocument();
+                    xmp.metadata(metadata); // metadata is declared outside this hunk
+                    xmp.endDocument();
+                }
+            };
+        }
+    };
 
     private final OutputType LANGUAGE = new OutputType() {
         @Override
@@ -308,7 +327,9 @@ public class TikaCLI {
         } else if (arg.startsWith("--encoding=")) {
             encoding = arg.substring("--encoding=".length());
         } else  if (arg.equals("-j") || arg.equals("--json")) {
-            type = JSON;            
+            type = JSON;
+        } else  if (arg.equals("-y") || arg.equals("--xmp")) {
+            type = XMP;
         } else if (arg.equals("-x") || arg.equals("--xml")) {
             type = XML;
         } else if (arg.equals("-h") || arg.equals("--html")) {
@@ -386,10 +407,11 @@ public class TikaCLI {
         out.println();
         out.println("    -x  or --xml           Output XHTML content (default)");
         out.println("    -h  or --html          Output HTML content");
-        out.println("    -j  or --json          Output JSON content");
         out.println("    -t  or --text          Output plain text content");
         out.println("    -T  or --text-main     Output plain text content (main content only)");
         out.println("    -m  or --metadata      Output only metadata");
+        out.println("    -j  or --json          Output metadata in JSON");
+        out.println("    -y  or --xmp           Output metadata in XMP");
         out.println("    -l  or --language      Output only language");
         out.println("    -d  or --detect        Detect document type");
         out.println("    -eX or --encoding=X    Use output encoding X");
@@ -430,7 +452,7 @@ public class TikaCLI {
         out.println();
         out.println("- Server mode");
         out.println();
-        out.println("    Use the \"-server\" (or \"-s\") option to start the");
+        out.println("    Use the \"--server\" (or \"-s\") option to start the");
         out.println("    Apache Tika server. The server will listen to the");
         out.println("    ports you specify as one or more arguments.");
         out.println();

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java?rev=1185805&r1=1185804&r2=1185805&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Property.java Tue Oct 18 19:11:47 2011
@@ -18,8 +18,12 @@ package org.apache.tika.metadata;
 
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
 
 /**
  * XMP property definition. Each instance of this class defines a single
@@ -30,7 +34,7 @@ import java.util.Set;
  *
  * @since Apache Tika 0.7
  */
-public final class Property {
+public final class Property implements Comparable<Property> {
 
     public static enum PropertyType {
         SIMPLE, STRUCTURE, BAG, SEQ, ALT
@@ -41,6 +45,9 @@ public final class Property {
         MIME_TYPE, PROPER_NAME, RATIONAL, REAL, TEXT, URI, URL, XPATH
     }
 
+    private static final Map<String, Property> properties =
+            new HashMap<String, Property>();
+
     private final String name;
 
     private final boolean internal;
@@ -67,6 +74,10 @@ public final class Property {
         } else {
             this.choices = null;
         }
+
+        synchronized (properties) {
+            properties.put(name, this);
+        }
     }
 
     private Property(
@@ -116,6 +127,19 @@ public final class Property {
         return choices;
     }
 
+    public static SortedSet<Property> getProperties(String prefix) { // registered properties named "prefix:...", sorted by name
+        SortedSet<Property> set = new TreeSet<Property>();
+        String p = prefix + ":";
+        synchronized (properties) { // same lock the constructor takes when registering a property
+            for (Map.Entry<String, Property> entry : properties.entrySet()) { // entrySet avoids a second lookup per match
+                if (entry.getKey().startsWith(p)) {
+                    set.add(entry.getValue());
+                }
+            }
+        }
+        return set;
+    }
+
     public static Property internalBoolean(String name) {
         return new Property(name, true, ValueType.BOOLEAN);
     }
@@ -182,4 +206,20 @@ public final class Property {
         return new Property(name, false, ValueType.TEXT);
     }
 
+    //----------------------------------------------------------< Comparable >
+
+    public int compareTo(Property o) { // natural order: String comparison of the property names
+        return name.compareTo(o.name);
+    }
+
+    //--------------------------------------------------------------< Object >
+
+    public boolean equals(Object o) { // equal when o is a Property with the same name; no other field is compared
+        return o instanceof Property && name.equals(((Property) o).name);
+    }
+
+    public int hashCode() { // hash of the name only, matching equals()
+        return name.hashCode();
+    }
+
 }

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java?rev=1185805&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java Tue Oct 18 19:11:47 2011
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that simplifies the task of producing XMP output.
+ *
+ * @since Apache Tika 1.0
+ */
+public class XMPContentHandler extends SafeContentHandler {
+
+    /**
+     * The RDF namespace URI
+     */
+    public static final String RDF =
+            "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+    /**
+     * The XMP namespace URI
+     */
+    public static final String XMP =
+            "http://ns.adobe.com/xap/1.0/";
+
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    public XMPContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Starts an XMP document by setting up the namespace mappings and
+     * writing out the following header:
+     * <pre>
+     * &lt;rdf:RDF&gt;
+     * </pre>
+     */
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+
+        startPrefixMapping("rdf", RDF);
+        startPrefixMapping("xmp", XMP);
+
+        startElement(RDF, "RDF", "rdf:RDF", EMPTY_ATTRIBUTES);
+    }
+
+    /**
+     * Ends the XMP document by writing the following footer and
+     * clearing the namespace mappings:
+     * <pre>
+     * &lt;/rdf:RDF&gt;
+     * </pre>
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        endElement(RDF, "RDF", "rdf:RDF");
+
+        endPrefixMapping("xmp");
+        endPrefixMapping("rdf");
+
+        super.endDocument();
+    }
+
+    //------------------------------------------< public convenience methods >
+
+    private String prefix = null;
+
+    private String uri = null;
+
+    public void startDescription(String about, String prefix, String uri)
+            throws SAXException {
+        this.prefix = prefix;
+        this.uri = uri;
+
+        startPrefixMapping(prefix, uri);
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute(RDF, "about", "rdf:about", "CDATA", about);
+        startElement(RDF, "Description", "rdf:Description", attributes);
+    }
+
+    public void endDescription() throws SAXException {
+        endElement(RDF, "Description", "rdf:Description");
+        endPrefixMapping(prefix);
+
+        this.uri = null;
+        this.prefix = null;
+    }
+
+    public void property(String name, String value) throws SAXException {
+        String qname = prefix + ":" + name;
+        startElement(uri, name, qname, EMPTY_ATTRIBUTES);
+        characters(value.toCharArray(), 0, value.length());
+        endElement(uri, name, qname);
+    }
+
+    public void metadata(Metadata metadata) throws SAXException {
+        description(metadata, "xmp", XMP);
+        description(metadata, "dc", "http://purl.org/dc/elements/1.1/");
+        description(metadata, "xmpTPg", "http://ns.adobe.com/xap/1.0/t/pg/");
+        description(metadata, "xmpRigths", "http://ns.adobe.com/xap/1.0/rights/");
+        description(metadata, "xmpMM", "http://ns.adobe.com/xap/1.0/mm/");
+        description(metadata, "xmpidq", "http://ns.adobe.com/xmp/identifier/qual/1.0/");
+        description(metadata, "xmpBJ", "http://ns.adobe.com/xap/1.0/bj/");
+        description(metadata, "xmpDM", "http://ns.adobe.com/xmp/1.0/DynamicMedia/");
+        description(metadata, "pdf", "http://ns.adobe.com/pdf/1.3/");
+        description(metadata, "photoshop", "s http://ns.adobe.com/photoshop/1.0/");
+        description(metadata, "crs", "http://ns.adobe.com/camera-raw-settings/1.0/");
+        description(metadata, "tiff", "http://ns.adobe.com/tiff/1.0/");
+        description(metadata, "exif", "http://ns.adobe.com/exif/1.0/");
+        description(metadata, "aux", "http://ns.adobe.com/exif/1.0/aux/");
+    }
+
+    private void description(Metadata metadata, String prefix, String uri)
+            throws SAXException {
+        int count = 0;
+        for (Property property : Property.getProperties(prefix)) {
+            String value = metadata.get(property);
+            if (value != null) {
+                if (count++ == 0) {
+                    startDescription("", prefix, uri);
+                }
+                property(property.getName().substring(prefix.length() + 1), value);
+            }
+        }
+
+        if (count > 0) {
+            endDescription();
+        }
+    }
+
+}