You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/04/06 09:00:07 UTC
svn commit: r931037 - in /lucene/tika/trunk: ./
tika-core/src/main/java/org/apache/tika/metadata/ tika-parent/
tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/netcdf/
tika-parsers/src/test/java/org/apache/tika/parser/netcdf/ tika-parser...
Author: mattmann
Date: Tue Apr 6 07:00:07 2010
New Revision: 931037
URL: http://svn.apache.org/viewvc?rev=931037&view=rev
Log:
- Basic support for netCDF parsing, as specified in TIKA-400 (netCDF Tika Parser). It can be extended later, but there is enough support right now to commit. Includes basic unit tests.
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java (with props)
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java (with props)
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc (with props)
Modified:
lucene/tika/trunk/CHANGES.txt
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
lucene/tika/trunk/tika-parent/pom.xml
lucene/tika/trunk/tika-parsers/pom.xml
Modified: lucene/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Tue Apr 6 07:00:07 2010
@@ -4,6 +4,9 @@ Release 0.8 - Current Development
The most notable changes in Tika 0.8 over previous releases are:
+ * Support for parsing common scientific data formats including netCDF
+ was added (TIKA-400).
+
* Unit tests for Windows have been fixed, allowing TestParsers
to complete. (TIKA-398)
Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java?rev=931037&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java Tue Apr 6 07:00:07 2010
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.metadata;
+
+/**
+ * Metadata keys ("met keys") taken from NCAR CCSM files that follow the <a
+ * href="http://cf-pcmdi.llnl.gov/">Climate Forecast Convention</a>. Each
+ * constant's value is the key name as it is stored in {@link Metadata}.
+ */
+public interface ClimateForcast {
+    // Fields of an interface are implicitly public, static and final.
+    String PROGRAM_ID = "prg_ID";
+
+    String COMMAND_LINE = "cmd_ln";
+
+    String HISTORY = "history";
+
+    String TABLE_ID = "table_id";
+
+    String INSTITUTION = "institution";
+
+    String SOURCE = "source";
+
+    String CONTACT = "contact";
+
+    String PROJECT_ID = "project_id";
+
+    String CONVENTIONS = "Conventions";
+
+    String REFERENCES = "references";
+
+    String ACKNOWLEDGEMENT = "acknowledgement";
+
+    String REALIZATION = "realization";
+
+    String EXPERIMENT_ID = "experiment_id";
+
+    String COMMENT = "comment";
+
+    String MODEL_NAME_ENGLISH = "model_name_english";
+}
Propchange: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java Tue Apr 6 07:00:07 2010
@@ -25,7 +25,7 @@ import java.util.Properties;
* A multi-valued metadata container.
*/
public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
- MSOffice, TikaMetadataKeys, TikaMimeKeys {
+ MSOffice, ClimateForcast, TikaMetadataKeys, TikaMimeKeys {
/**
* A map of all metadata attributes.
Modified: lucene/tika/trunk/tika-parent/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parent/pom.xml?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parent/pom.xml (original)
+++ lucene/tika/trunk/tika-parent/pom.xml Tue Apr 6 07:00:07 2010
@@ -27,6 +27,20 @@
<artifactId>apache</artifactId>
<version>6</version>
</parent>
+
+ <repositories> <!-- Extra repository; presumably hosts the netcdf-java artifact used by tika-parsers (TODO confirm). -->
+ <repository>
+ <id>netcdf-m2-repo</id>
+ <name>netCDF Java Maven2 Repository</name>
+ <url>http://ulisse.pin.unifi.it:8081/nexus/content/groups/open.repos</url> <!-- NOTE(review): plain http on a third-party host; confirm availability and whether an https endpoint exists. -->
+ <releases>
+ <enabled>true</enabled>
+ </releases>
+ <snapshots>
+ <enabled>true</enabled> <!-- NOTE(review): snapshots are enabled for this repository; confirm that any snapshot artifact is actually required. -->
+ </snapshots>
+ </repository>
+ </repositories>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
Modified: lucene/tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/pom.xml?rev=931037&r1=931036&r2=931037&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/pom.xml (original)
+++ lucene/tika/trunk/tika-parsers/pom.xml Tue Apr 6 07:00:07 2010
@@ -44,6 +44,27 @@
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency> <!-- NetCDF for Java API (ucar.nc2), used by NetCDFParser. -->
+ <groupId>essi-unidata</groupId>
+ <artifactId>netcdf-java</artifactId>
+ <version>4.0.41</version>
+ </dependency>
+ <dependency> <!-- Presumably needed at runtime by netcdf-java; not referenced by the parser code itself (TODO confirm). -->
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ <version>3.1</version>
+ </dependency>
+ <dependency> <!-- Logging API; presumably needed at runtime by netcdf-java (TODO confirm). -->
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.5.6</version>
+ </dependency>
+ <dependency> <!-- SLF4J binding, declared with test scope only. -->
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>1.5.6</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java?rev=931037&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java Tue Apr 6 07:00:07 2010
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+//TIKA imports
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+//NETCDF imports
+import ucar.nc2.Attribute;
+import ucar.nc2.NetcdfFile;
+
+/**
+ * A {@link Parser} for <a
+ * href="http://www.unidata.ucar.edu/software/netcdf/index.html">NetCDF</a>
+ * files using the UCAR, MIT-licensed <a
+ * href="http://www.unidata.ucar.edu/software/netcdf-java/">NetCDF for Java</a>
+ * API. Only the file's global attributes are extracted, as metadata entries.
+ */
+public class NetCDFParser implements Parser {
+
+    /** The single supported type, <code>application/x-netcdf</code>. */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("x-netcdf"));
+
+    /**
+     * Returns the media types handled by this parser.
+     * @param context parse context (not consulted)
+     * @return an immutable singleton set
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Buffers the whole stream in memory, opens it as a netCDF file and adds
+     * each string or numeric global attribute to <code>metadata</code> under
+     * the attribute's own name. Nothing is written to <code>handler</code>.
+     *
+     * @throws IOException if the netCDF data cannot be opened or closed
+     * @throws TikaException if the stream cannot be read
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        ByteArrayOutputStream os = new ByteArrayOutputStream();
+        this.writeStreamToMemory(stream, os);
+        NetcdfFile ncFile = NetcdfFile.openInMemory("", os.toByteArray());
+        try {
+            for (Attribute attr : ncFile.getGlobalAttributes()) {
+                String attrName = attr.getName();
+                if (attr.getDataType().isString()) {
+                    metadata.add(attrName, attr.getStringValue());
+                } else if (attr.getDataType().isNumeric()) {
+                    // NOTE: intValue() drops any fractional part.
+                    metadata.add(attrName, String.valueOf(attr
+                            .getNumericValue().intValue()));
+                }
+            }
+        } finally {
+            ncFile.close(); // release the in-memory file even on failure
+        }
+    }
+
+    /**
+     * Convenience overload that parses with an empty {@link ParseContext}.
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        this.parse(stream, handler, metadata, new ParseContext());
+    }
+
+    /**
+     * Copies <code>is</code> to <code>os</code> until end of stream.
+     * @throws TikaException wrapping any {@link IOException} from the read
+     */
+    private void writeStreamToMemory(InputStream is, ByteArrayOutputStream os)
+            throws TikaException {
+        byte[] buf = new byte[512];
+        try {
+            int count;
+            // Write only the bytes actually read; a read may be short.
+            while ((count = is.read(buf, 0, buf.length)) != -1) {
+                os.write(buf, 0, count);
+            }
+        } catch (IOException e) {
+            throw new TikaException(e.getMessage(), e);
+        }
+    }
+}
Added: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java?rev=931037&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java (added)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java Tue Apr 6 07:00:07 2010
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+//Junit imports
+import junit.framework.TestCase;
+
+/**
+ * Test cases to exercise the {@link NetCDFParser}.
+ */
+public class NetCDFParserTest extends TestCase {
+
+    /** Checks global attributes extracted from the bundled sample file. */
+    public void testParseGlobalMetadata() throws Exception {
+        Parser parser = new NetCDFParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = NetCDFParser.class
+                .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
+        assertNotNull("test document not found on classpath", stream);
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        // JUnit's argument order is (expected, actual).
+        assertEquals("model output prepared for IPCC AR4",
+                metadata.get(Metadata.TITLE));
+        assertEquals("ccsm@ucar.edu", metadata.get(Metadata.CONTACT));
+        assertEquals("IPCC Fourth Assessment",
+                metadata.get(Metadata.PROJECT_ID));
+        assertEquals("CF-1.0", metadata.get(Metadata.CONVENTIONS));
+        assertEquals("1", metadata.get(Metadata.REALIZATION));
+        assertEquals("720 ppm stabilization experiment (SRESA1B)",
+                metadata.get(Metadata.EXPERIMENT_ID));
+    }
+}
Propchange: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc?rev=931037&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream