You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/04/06 09:00:07 UTC

svn commit: r931037 - in /lucene/tika/trunk: ./ tika-core/src/main/java/org/apache/tika/metadata/ tika-parent/ tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/netcdf/ tika-parsers/src/test/java/org/apache/tika/parser/netcdf/ tika-parser...

Author: mattmann
Date: Tue Apr  6 07:00:07 2010
New Revision: 931037

URL: http://svn.apache.org/viewvc?rev=931037&view=rev
Log:
- basic support for netCDF parsing, as specified in TIKA-400 netCDF Tika Parser. Can extend more later, but enough support right now to commit. Includes basic unit tests.

Added:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java   (with props)
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java   (with props)
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc   (with props)
Modified:
    lucene/tika/trunk/CHANGES.txt
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
    lucene/tika/trunk/tika-parent/pom.xml
    lucene/tika/trunk/tika-parsers/pom.xml

Modified: lucene/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Tue Apr  6 07:00:07 2010
@@ -4,6 +4,9 @@ Release 0.8 - Current Development
 
 The most notable changes in Tika 0.8 over previous releases are:
 
+ * Support for parsing common scientific data formats including netCDF
+   was added (TIKA-400).
+
  * Unit tests for Windows have been fixed, allowing TestParsers
    to complete. (TIKA-398)
 

Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java?rev=931037&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java Tue Apr  6 07:00:07 2010
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.metadata;
+
+/**
+ * Metadata keys taken from NCAR CCSM files that use the <a
+ * href="http://cf-pcmdi.llnl.gov/">Climate Forecast Convention</a>.
+ */
+public interface ClimateForcast {
+    /** Key for the {@code prg_ID} (program ID) attribute. */
+    String PROGRAM_ID = "prg_ID";
+    /** Key for the {@code cmd_ln} (command line) attribute. */
+    String COMMAND_LINE = "cmd_ln";
+    /** Key for the {@code history} attribute. */
+    String HISTORY = "history";
+    /** Key for the {@code table_id} attribute. */
+    String TABLE_ID = "table_id";
+    /** Key for the {@code institution} attribute. */
+    String INSTITUTION = "institution";
+    /** Key for the {@code source} attribute. */
+    String SOURCE = "source";
+    /** Key for the {@code contact} attribute. */
+    String CONTACT = "contact";
+    /** Key for the {@code project_id} attribute. */
+    String PROJECT_ID = "project_id";
+    /** Key for the {@code Conventions} attribute (note the capital C). */
+    String CONVENTIONS = "Conventions";
+    /** Key for the {@code references} attribute. */
+    String REFERENCES = "references";
+    /** Key for the {@code acknowledgement} attribute. */
+    String ACKNOWLEDGEMENT = "acknowledgement";
+    /** Key for the {@code realization} attribute. */
+    String REALIZATION = "realization";
+    /** Key for the {@code experiment_id} attribute. */
+    String EXPERIMENT_ID = "experiment_id";
+    /** Key for the {@code comment} attribute. */
+    String COMMENT = "comment";
+    /** Key for the {@code model_name_english} attribute. */
+    String MODEL_NAME_ENGLISH = "model_name_english";
+
+}

Propchange: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java Tue Apr  6 07:00:07 2010
@@ -25,7 +25,7 @@ import java.util.Properties;
  * A multi-valued metadata container.
  */
 public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
-        MSOffice, TikaMetadataKeys, TikaMimeKeys {
+        MSOffice, ClimateForcast, TikaMetadataKeys, TikaMimeKeys {
 
     /**
      * A map of all metadata attributes.

Modified: lucene/tika/trunk/tika-parent/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parent/pom.xml?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parent/pom.xml (original)
+++ lucene/tika/trunk/tika-parent/pom.xml Tue Apr  6 07:00:07 2010
@@ -27,6 +27,20 @@
     <artifactId>apache</artifactId>
     <version>6</version>
   </parent>
+  
+  <repositories>
+    <repository> <!-- NOTE(review): presumably hosts netcdf-java; plain HTTP - confirm -->
+      <id>netcdf-m2-repo</id>
+      <name>netCDF Java Maven2 Repository</name>
+      <url>http://ulisse.pin.unifi.it:8081/nexus/content/groups/open.repos</url>
+      <releases>
+        <enabled>true</enabled>
+      </releases>
+      <snapshots>
+        <enabled>true</enabled>
+      </snapshots>
+    </repository>
+  </repositories>
 
   <groupId>org.apache.tika</groupId>
   <artifactId>tika-parent</artifactId>

Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Tue Apr  6 07:00:07 2010
@@ -44,6 +44,27 @@
       <artifactId>tika-core</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency> <!-- NetCDF Java API (ucar.nc2) used by NetCDFParser -->
+      <groupId>essi-unidata</groupId>
+      <artifactId>netcdf-java</artifactId>
+      <version>4.0.41</version>
+    </dependency>
+    <dependency> <!-- presumably needed by netcdf-java at runtime; TODO confirm -->
+      <groupId>commons-httpclient</groupId>
+      <artifactId>commons-httpclient</artifactId>
+      <version>3.1</version>
+    </dependency>
+    <dependency> <!-- presumably the logging API netcdf-java uses; TODO confirm -->
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>1.5.6</version>
+    </dependency>
+    <dependency> <!-- SLF4J-to-log4j binding, limited to test scope -->
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <version>1.5.6</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-compress</artifactId>

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java?rev=931037&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java Tue Apr  6 07:00:07 2010
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+//TIKA imports
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+//NETCDF imports
+import ucar.nc2.Attribute;
+import ucar.nc2.NetcdfFile;
+
+/**
+ * A {@link Parser} for <a
+ * href="http://www.unidata.ucar.edu/software/netcdf/index.html">NetCDF</a>
+ * files using the UCAR, MIT-licensed <a
+ * href="http://www.unidata.ucar.edu/software/netcdf-java/">NetCDF for Java</a>
+ * API.
+ */
+public class NetCDFParser implements Parser {
+
+    /** The only media type handled: {@code application/x-netcdf}. */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("x-netcdf"));
+
+    /**
+     * Returns the media types supported by this parser.
+     *
+     * @return a singleton set containing {@code application/x-netcdf}
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Buffers the whole stream in memory, opens it as a NetCDF file and copies
+     * every string or numeric global attribute into {@code metadata}. Nothing
+     * is written to {@code handler}.
+     *
+     * @throws IOException if the NetCDF library cannot open or close the data
+     * @throws TikaException if reading {@code stream} fails
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        this.writeStreamToMemory(stream, os);
+
+        NetcdfFile ncFile = NetcdfFile.openInMemory("", os.toByteArray());
+        try {
+            for (Attribute attr : ncFile.getGlobalAttributes()) {
+                String attrName = attr.getName();
+                if (attr.getDataType().isString()) {
+                    metadata.add(attrName, attr.getStringValue());
+                } else if (attr.getDataType().isNumeric()) {
+                    // NOTE(review): intValue() truncates non-integer values
+                    metadata.add(attrName, String.valueOf(attr
+                            .getNumericValue().intValue()));
+                }
+            }
+        } finally {
+            ncFile.close(); // release the NetCDF handle even on failure
+        }
+    }
+
+    /** Convenience overload that parses with a fresh {@link ParseContext}. */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        this.parse(stream, handler, metadata, new ParseContext());
+    }
+
+    /**
+     * Copies {@code is} into {@code os} until end of stream.
+     *
+     * @throws TikaException wrapping any {@link IOException} raised by the read
+     */
+    private void writeStreamToMemory(InputStream is, ByteArrayOutputStream os)
+            throws TikaException {
+        byte[] buf = new byte[512];
+        try {
+            int n;
+            while ((n = is.read(buf)) != -1) {
+                os.write(buf, 0, n); // only the bytes actually read
+            }
+        } catch (IOException e) {
+            throw new TikaException(e.getMessage(), e);
+        }
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java?rev=931037&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java (added)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java Tue Apr  6 07:00:07 2010
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+//Junit imports
+import junit.framework.TestCase;
+
+/**
+ * Test cases to exercise the {@link NetCDFParser}.
+ * 
+ */
+public class NetCDFParserTest extends TestCase {
+    /** Checks that global attributes of the sample file become metadata. */
+    public void testParseGlobalMetadata() throws Exception {
+        Parser parser = new NetCDFParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = NetCDFParser.class
+                .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
+        assertNotNull("test document not found on classpath", stream);
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        // JUnit convention: expected value first, actual value second
+        assertEquals("model output prepared for IPCC AR4",
+                metadata.get(Metadata.TITLE));
+        assertEquals("ccsm@ucar.edu", metadata.get(Metadata.CONTACT));
+        assertEquals("IPCC Fourth Assessment",
+                metadata.get(Metadata.PROJECT_ID));
+        assertEquals("CF-1.0", metadata.get(Metadata.CONVENTIONS));
+        assertEquals("1", metadata.get(Metadata.REALIZATION));
+        assertEquals("720 ppm stabilization experiment (SRESA1B)",
+                metadata.get(Metadata.EXPERIMENT_ID));
+    }
+}

Propchange: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc?rev=931037&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream