You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@oodt.apache.org by ri...@apache.org on 2013/08/29 19:20:19 UTC

svn commit: r1518713 - /oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java

Author: riverma
Date: Thu Aug 29 17:20:19 2013
New Revision: 1518713

URL: http://svn.apache.org/r1518713
Log:
OODT-652 : New TikaCmdLineMetExtractor

Added:
    oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java

Added: oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java?rev=1518713&view=auto
==============================================================================
--- oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java (added)
+++ oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java Thu Aug 29 17:20:19 2013
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.metadata.extractors;
+
+//JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.Enumeration;
+import java.util.logging.Logger;
+
+//OODT imports
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+import org.apache.tika.Tika;
+
+/**
+ * @author rverma
+ * @author arni
+ * @author mattmann
+ * @version $Revision$
+ * 
+ * <p>
+ * A Met Extractor that invokes Apache Tika to automatically detect
+ * relevant metadata for a given product.
+ * </p>
+ *
+ * <p>
+ * To use this extractor, a met extractor config file must be referenced.
+ * This can take the form of a Java properties file that specifies,
+ * at a minimum, the 'ProductType=...' metadata key.
+ * </p>
+ */
+public class TikaCmdLineMetExtractor extends CmdLineMetExtractor {
+
+    private static final Logger LOG = Logger
+            .getLogger(TikaCmdLineMetExtractor.class.getName());
+
+    /** Config reader that the constructor passes to the parent class. */
+    protected static MetReaderConfigReader reader =
+            new MetReaderConfigReader();
+
+    /** Creates an extractor that reads its config through {@link #reader}. */
+    public TikaCmdLineMetExtractor() {
+        super(reader);
+    }
+
+    /**
+     * Runs Tika on the given file, copies the detected metadata plus the
+     * parsed text (stored under the key "content") into an OODT
+     * {@link Metadata} object, and then adds every key/value pair found in
+     * the extractor config.
+     *
+     * @param file
+     *            the product to extract metadata from
+     * @return the combined Tika and config-file metadata
+     * @throws MetExtractionException
+     *             if anything goes wrong while reading or parsing; the
+     *             original exception is attached as the cause
+     */
+    @Override
+    public Metadata extrMetadata(File file) throws MetExtractionException {
+
+        // declared outside the try so the finally block can release it
+        InputStream is = null;
+        try {
+            org.apache.tika.metadata.Metadata tikaMet =
+                    new org.apache.tika.metadata.Metadata();
+            Metadata met = new Metadata();
+            is = new FileInputStream(file);
+
+            // extract met from prod using tika
+            LOG.fine("Invoking tika extractor on file ["
+                    + file.getAbsolutePath() + "]");
+            Tika tika = new Tika();
+            tika.parse(is, tikaMet); // extract metadata
+            tikaMet.add("content", tika.parseToString(file)); // extract content
+
+            LOG.fine("Number of captured tika metadata keys: ["
+                    + tikaMet.names().length + "]");
+
+            // copy tika met into oodt met
+            for (String key : tikaMet.names()) {
+                met.addMetadata(key, tikaMet.get(key));
+                LOG.fine("Added tika met key [" + key + "] with value ["
+                        + met.getMetadata(key) + "]");
+            }
+
+            // add config file met
+            MetReaderConfig myConfig = (MetReaderConfig) this.config;
+            Enumeration<Object> configMetKeys = myConfig.keys();
+            while (configMetKeys.hasMoreElements()) {
+                String configMetKey = (String) configMetKeys.nextElement();
+                String configMetKeyVal = (String) myConfig.get(configMetKey);
+
+                met.addMetadata(configMetKey, configMetKeyVal);
+                // look the stored value up by its key, not by the value
+                LOG.fine("Added config file met key [" + configMetKey +
+                        "] with value [" + met.getMetadata(configMetKey) + "]");
+            }
+
+            return met;
+
+        } catch (Exception e) {
+            // log through the logger (with stack trace) instead of stderr
+            LOG.log(java.util.logging.Level.SEVERE, e.getMessage(), e);
+            MetExtractionException failure =
+                    new MetExtractionException(e.getMessage());
+            // NOTE(review): assumes the String constructor leaves the cause
+            // unset, as java.lang.Exception(String) does - confirm
+            failure.initCause(e);
+            throw failure;
+        } finally {
+            // release the file handle on every path; a failed close must
+            // not mask the extraction result or the original exception
+            if (is != null) {
+                try {
+                    is.close();
+                } catch (java.io.IOException closeError) {
+                    LOG.fine("Unable to close input stream for file ["
+                            + file.getAbsolutePath() + "]: " + closeError);
+                }
+            }
+        }
+    }
+
+    /**
+     * Command-line entry point; hands the arguments and a new extractor
+     * instance to {@code processMain}.
+     */
+    public static void main(String[] args) throws Exception {
+        processMain(args, new TikaCmdLineMetExtractor());
+    }
+
+}