You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@oodt.apache.org by ri...@apache.org on 2013/08/29 19:20:19 UTC
svn commit: r1518713 -
/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java
Author: riverma
Date: Thu Aug 29 17:20:19 2013
New Revision: 1518713
URL: http://svn.apache.org/r1518713
Log:
OODT-652 : New TikaCmdLineMetExtractor
Added:
oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java
Added: oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java
URL: http://svn.apache.org/viewvc/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java?rev=1518713&view=auto
==============================================================================
--- oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java (added)
+++ oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/TikaCmdLineMetExtractor.java Thu Aug 29 17:20:19 2013
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.metadata.extractors;
+
+//JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.Enumeration;
+import java.util.logging.Logger;
+
+//OODT imports
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+import org.apache.tika.Tika;
+
+/**
+ * @author rverma
+ * @author arni
+ * @author mattmann
+ * @version $Revision$
+ *
+ * <p>
+ * A Met Extractor that invokes Apache Tika to automatically detect
+ * relevant metadata for a given product.
+ * </p>
+ *
+ * <p>
+ * To use this extractor, a met extractor config file must be referenced.
+ * This can take the form of a Java properties file that includes,
+ * at a minimum, the 'ProductType=...' metadata key specified.
+ * </p>
+ */
+public class TikaCmdLineMetExtractor extends CmdLineMetExtractor {
+
+  private static final Logger LOG = Logger
+      .getLogger(TikaCmdLineMetExtractor.class.getName());
+
+  // Config reader handed to the CmdLineMetExtractor constructor below.
+  protected static MetReaderConfigReader reader =
+      new MetReaderConfigReader();
+
+  /**
+   * Creates an extractor that is wired to the shared {@link #reader}.
+   */
+  public TikaCmdLineMetExtractor() {
+    super(reader);
+  }
+
+  /**
+   * Runs Apache Tika over the given file and returns the combined metadata:
+   * every key Tika reported, the extracted text under the key "content",
+   * and every key/value pair found in this extractor's config.
+   *
+   * @param file
+   *          the product file to extract metadata from
+   * @return the Tika metadata, the "content" key and the config file keys
+   * @throws MetExtractionException
+   *           if anything goes wrong while reading or parsing the file; the
+   *           original exception is attached as the cause
+   */
+  @Override
+  public Metadata extrMetadata(File file) throws MetExtractionException {
+
+    InputStream is = null;
+    java.io.Reader tikaReader = null;
+
+    try {
+      org.apache.tika.metadata.Metadata tikaMet =
+          new org.apache.tika.metadata.Metadata();
+      Metadata met = new Metadata();
+      is = new FileInputStream(file);
+
+      // extract met from prod using tika
+      LOG.fine("Invoking tika extractor on file ["
+          + file.getAbsolutePath() + "]");
+      Tika tika = new Tika();
+      // keep the returned reader so it can be closed once we are done
+      tikaReader = tika.parse(is, tikaMet); // extract metadata
+      tikaMet.add("content", tika.parseToString(file)); // extract content
+
+      LOG.fine("Number of captured tika metadata keys: ["
+          + tikaMet.names().length + "]");
+
+      // copy tika met into oodt met
+      for (String key : tikaMet.names()) {
+        met.addMetadata(key, tikaMet.get(key));
+        LOG.fine("Added tika met key [" + key + "] with value ["
+            + met.getMetadata(key) + "]");
+      }
+
+      // NOTE(review): assumes the inherited config is always a
+      // MetReaderConfig because the constructor passes a
+      // MetReaderConfigReader -- confirm against the superclass.
+      MetReaderConfig myConfig = (MetReaderConfig) this.config;
+
+      // add config file met
+      Enumeration<Object> configMetKeys = myConfig.keys();
+      while (configMetKeys.hasMoreElements()) {
+        String configMetKey = (String) configMetKeys.nextElement();
+        String configMetKeyVal = (String) myConfig.get(configMetKey);
+
+        met.addMetadata(configMetKey, configMetKeyVal);
+        // log the value stored under the key (not under the value itself)
+        LOG.fine("Added config file met key [" + configMetKey +
+            "] with value [" + met.getMetadata(configMetKey) + "]");
+      }
+
+      return met;
+
+    } catch (Exception e) {
+      // log with the throwable so the stack trace goes to the logger
+      // instead of stderr
+      LOG.log(java.util.logging.Level.SEVERE, e.getMessage(), e);
+      MetExtractionException failure =
+          new MetExtractionException(e.getMessage());
+      // attach the cause so callers keep the original stack trace
+      failure.initCause(e);
+      throw failure;
+    } finally {
+      // Released only after the metadata has been copied, so the parse is
+      // not cut short. The stream is closed explicitly as well in case the
+      // reader was never created.
+      closeQuietly(tikaReader);
+      closeQuietly(is);
+    }
+  }
+
+  /**
+   * Closes the given resource if it is non-null. A failure to close is
+   * logged and otherwise ignored so it cannot mask the extraction result.
+   */
+  private static void closeQuietly(java.io.Closeable resource) {
+    if (resource == null) {
+      return;
+    }
+    try {
+      resource.close();
+    } catch (java.io.IOException e) {
+      LOG.fine("Unable to close resource: " + e.getMessage());
+    }
+  }
+
+  /**
+   * Command line entry point; hands the arguments and a new extractor
+   * instance to the inherited processMain.
+   */
+  public static void main(String[] args) throws Exception {
+    processMain(args, new TikaCmdLineMetExtractor());
+  }
+
+}