You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2015/01/29 22:52:47 UTC

svn commit: r1655873 - in /tika/trunk: ./ tika-bundle/ tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/grib/ tika-parsers/src/main/resources/META-INF/services/ tika-parsers/src/test/java/org/apache/tika/parser/grib/ tika-parsers/src/tes...

Author: lewismc
Date: Thu Jan 29 21:52:47 2015
New Revision: 1655873

URL: http://svn.apache.org/r1655873
Log:
TIKA-1423 Build a parser to extract data from GRIB formats

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-bundle/pom.xml
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Jan 29 21:52:47 2015
@@ -1,4 +1,6 @@
 Release 1.8 - Current Development
+  * Build a parser to extract data from GRIB formats (TIKA-1423)
+
   * Upgrade to Commons Compress 1.9 (TIKA-1534).
 
   * Include media duration in metadata parsed by MP4Parser (TIKA-1530).

Modified: tika/trunk/tika-bundle/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Thu Jan 29 21:52:47 2015
@@ -125,7 +125,8 @@
               boilerpipe, rome,
               apache-mime4j-core, apache-mime4j-dom,
               jhighlight, java-libpst,
-              netcdf, jcip-annotations, jmatio
+              netcdf4, grib, cdm, httpservices, jcip-annotations, 
+              jmatio, guava
             </Embed-Dependency>
             <Embed-Transitive>true</Embed-Transitive>
             <Bundle-DocURL>${project.url}</Bundle-DocURL>
@@ -206,6 +207,9 @@
               org.jdom;resolution:=optional,
               org.jdom.input;resolution:=optional,
               org.jdom.output;resolution:=optional,
+              org.jdom2;resolution:=optional,
+              org.jdom2.input;resolution:=optional,
+              org.jdom2.output;resolution:=optional,
               org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
               org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
               org.osgi.framework;resolution:=optional,
@@ -221,6 +225,25 @@
               schemasMicrosoftComOfficePowerpoint;resolution:=optional,
               schemasMicrosoftComOfficeWord;resolution:=optional,
               sun.misc;resolution:=optional,
+              ucar.units;resolution:=optional,
+              ucar.httpservices;resolution:=optional,
+              ucar.nc2.util;resolution:=optional,
+              ucar.nc2.util.cache;resolution:=optional,
+              ucar.nc2.dataset;resolution:=optional,
+              ucar.nc2;resolution:=optional,
+              ucar.nc2.constants;resolution:=optional,
+              ucar.nc2.dt;resolution:=optional,
+              ucar.nc2.dt.grid;resolution:=optional,
+              ucar.nc2.ft;resolution:=optional,
+              ucar.nc2.iosp;resolution:=optional,
+              ucar.nc2.iosp.hdf4;resolution:=optional,
+              ucar.nc2.ncml;resolution:=optional,
+              ucar.nc2.stream;resolution:=optional,
+              ucar.nc2.time;resolution:=optional,
+              ucar.nc2.units;resolution:=optional,
+              ucar.nc2.wmo;resolution:=optional,
+              ucar.nc2.write;resolution:=optional,
+              ucar.ma2;resolution:=optional,
               ucar.grib;resolution:=optional,
               ucar.grib.grib1;resolution:=optional,
               ucar.grib.grib2;resolution:=optional,
@@ -236,7 +259,56 @@
               visad.data;resolution:=optional,
               visad.data.vis5d;resolution:=optional,
               visad.jmet;resolution:=optional,
-              visad.util;resolution:=optional
+              visad.util;resolution:=optional,
+              colorspace;resolution:=optional,
+              com.sun.jna;resolution:=optional,
+              com.sun.jna.ptr;resolution:=optional,
+              icc;resolution:=optional,
+              jj2000.j2k.codestream;resolution:=optional,
+              jj2000.j2k.codestream.reader;resolution:=optional,
+              jj2000.j2k.decoder;resolution:=optional,
+              jj2000.j2k.entropy.decoder;resolution:=optional,
+              jj2000.j2k.fileformat.reader;resolution:=optional,
+              jj2000.j2k.image;resolution:=optional,
+              jj2000.j2k.image.invcomptransf;resolution:=optional,
+              jj2000.j2k.image.output;resolution:=optional,
+              jj2000.j2k.io;resolution:=optional,
+              jj2000.j2k.quantization.dequantizer;resolution:=optional,
+              jj2000.j2k.roi;resolution:=optional,
+              jj2000.j2k.util;resolution:=optional,
+              jj2000.j2k.wavelet.synthesis;resolution:=optional,
+              org.itadaki.bzip2;resolution:=optional,
+              org.jsoup;resolution:=optional,
+              org.jsoup.nodes;resolution:=optional,
+              org.jsoup.select;resolution:=optional,
+              thredds.featurecollection;resolution:=optional,
+              thredds.filesystem;resolution:=optional,
+              thredds.inventory;resolution:=optional,
+              thredds.inventory.filter;resolution:=optional,
+              thredds.inventory.partition;resolution:=optional,
+              com.beust.jcommander;resolution:=optional,
+              com.google.common.base;resolution:=optional,
+              com.google.common.math;resolution:=optional,
+              org.apache.http;resolution:=optional,
+              org.joda.time;resolution:=optional,
+              org.joda.time.chrono;resolution:=optional,
+              org.joda.time.field;resolution:=optional,
+              org.joda.time.format;resolution:=optional,
+              sun.reflect.generics.reflectiveObjects;resolution:=optional,
+              org.apache.http.auth;resolution:=optional,
+              org.apache.http.client;resolution:=optional,
+              org.apache.http.client.entity;resolution:=optional,
+              org.apache.http.client.methods;resolution:=optional,
+              org.apache.http.conn;resolution:=optional,
+              org.apache.http.conn.scheme;resolution:=optional,
+              org.apache.http.cookie;resolution:=optional,
+              org.apache.http.entity;resolution:=optional,
+              org.apache.http.impl.client;resolution:=optional,
+              org.apache.http.impl.conn;resolution:=optional,
+              org.apache.http.message;resolution:=optional,
+              org.apache.http.params;resolution:=optional,
+              org.apache.http.protocol;resolution:=optional,
+              org.apache.http.util;resolution:=optional
             </Import-Package>
           </instructions>
         </configuration>

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Thu Jan 29 21:52:47 2015
@@ -44,6 +44,7 @@
     <mime4j.version>0.7.2</mime4j.version>
     <vorbis.version>0.6</vorbis.version>
     <pdfbox.version>1.8.8</pdfbox.version>
+    <netcdf-java.version>4.5.4</netcdf-java.version>
   </properties>
 
   <dependencies>
@@ -78,11 +79,6 @@
 
     <!-- Upstream parser libraries -->
     <dependency>
-      <groupId>edu.ucar</groupId>
-      <artifactId>netcdf</artifactId>
-      <version>4.2.20</version>
-    </dependency>
-    <dependency>
       <groupId>net.sourceforge.jmatio</groupId>
       <artifactId>jmatio</artifactId>
       <version>1.0</version>
@@ -234,6 +230,33 @@
       <artifactId>slf4j-log4j12</artifactId>
       <scope>test</scope>
     </dependency>
+
+    <!-- edu.ucar dependencies -->
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>netcdf4</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>grib</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>cdm</artifactId>
+      <version>${netcdf-java.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>edu.ucar</groupId>
+      <artifactId>httpservices</artifactId>
+      <version>${netcdf-java.version}</version>
+   </dependency>
+   <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <version>10.0.1</version>
+    </dependency>
   </dependencies>
 
   <build>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java?rev=1655873&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java Thu Jan 29 21:52:47 2015
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.grib;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.File;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import ucar.nc2.Attribute;
+import ucar.nc2.Dimension;
+import ucar.nc2.NetcdfFile;
+import ucar.nc2.Variable;
+import ucar.nc2.dataset.NetcdfDataset;
+
+/**
+ * Parser for GRIB files, advertised under the media type
+ * <code>application/x-grib2</code>.
+ * <p>
+ * The input is opened through {@link NetcdfDataset}. Global attributes are
+ * copied into the supplied {@link Metadata}; dimensions and variables (with
+ * their attributes) are written to the content handler as XHTML.
+ */
+public class GribParser extends AbstractParser {
+
+    private static final long serialVersionUID = 7855458954474247655L;
+
+    public static final String GRIB_MIME_TYPE = "application/x-grib2";
+
+    // Constant shared by all instances; there is no per-instance state here.
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("x-grib2"));
+
+    /**
+     * @param context parse context (not consulted)
+     * @return the single supported media type, <code>application/x-grib2</code>
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a GRIB document.
+     *
+     * @param stream   the document stream; it is not closed by this method
+     * @param handler  receives the XHTML listing of dimensions and variables
+     * @param metadata receives the content type and the global attributes
+     * @param context  parse context (not consulted)
+     * @throws IOException   if a file cannot be obtained for the stream
+     * @throws SAXException  if the content handler rejects an event
+     * @throws TikaException if the file cannot be opened or read as a dataset,
+     *                       or if temporary resources cannot be released
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        //Set MIME type as grib2
+        metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE);
+
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            // The NetCDF library is handed a path, so a File is needed
+            File gribFile = tis.getFile();
+            extract(gribFile, handler, metadata);
+        } finally {
+            // Release whatever was registered with tmp while obtaining the
+            // file, whether or not parsing succeeded
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Opens the given file as a dataset, copies its global attributes into the
+     * metadata and writes its dimensions and variables as XHTML. The dataset
+     * is always closed before this method returns.
+     */
+    private void extract(File gribFile, ContentHandler handler, Metadata metadata)
+            throws SAXException, TikaException {
+        try {
+            NetcdfFile ncFile = NetcdfDataset.openFile(gribFile.getAbsolutePath(), null);
+            try {
+                // first parse out the set of global attributes
+                for (Attribute attr : ncFile.getGlobalAttributes()) {
+                    Property property = resolveMetadataKey(attr.getFullName());
+                    if (attr.getDataType().isString()) {
+                        metadata.add(property, attr.getStringValue());
+                    } else if (attr.getDataType().isNumeric()) {
+                        // numeric attributes are recorded as their int value
+                        int value = attr.getNumericValue().intValue();
+                        metadata.add(property, String.valueOf(value));
+                    }
+                }
+
+                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+                xhtml.startDocument();
+
+                xhtml.newline();
+                xhtml.startElement("ul");
+                xhtml.characters("dimensions:");
+                xhtml.newline();
+
+                for (Dimension dim : ncFile.getDimensions()) {
+                    xhtml.element("li", dim.getFullName() + "=" + String.valueOf(dim.getLength()) + ";");
+                    xhtml.newline();
+                }
+
+                // The variables list is opened inside the dimensions list;
+                // both are closed together after the loop below.
+                xhtml.startElement("ul");
+                xhtml.characters("variables:");
+                xhtml.newline();
+
+                for (Variable var : ncFile.getVariables()) {
+                    xhtml.element("p", String.valueOf(var.getDataType()) + var.getNameAndDimensions() + ";");
+                    for (Attribute element : var.getAttributes()) {
+                        xhtml.element("li", " :" + element + ";");
+                        xhtml.newline();
+                    }
+                }
+                xhtml.endElement("ul");
+                xhtml.endElement("ul");
+                xhtml.endDocument();
+            } finally {
+                // Always give the file handle back; a failure to close is
+                // reported through the IOException handler below
+                ncFile.close();
+            }
+        } catch (IOException e) {
+            throw new TikaException("NetCDF parse error", e);
+        }
+    }
+
+    /**
+     * Maps a global attribute name to a metadata property: "title" becomes
+     * {@link TikaCoreProperties#TITLE}, any other name becomes an internal
+     * text property of the same name.
+     */
+    private Property resolveMetadataKey(String localName) {
+        if ("title".equals(localName)) {
+            return TikaCoreProperties.TITLE;
+        }
+        return Property.internalText(localName);
+    }
+
+}
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Thu Jan 29 21:52:47 2015
@@ -57,3 +57,4 @@ org.apache.tika.parser.code.SourceCodePa
 org.apache.tika.parser.mat.MatParser
 org.apache.tika.parser.ocr.TesseractOCRParser
 org.apache.tika.parser.gdal.GDALParser
+org.apache.tika.parser.grib.GribParser

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java?rev=1655873&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java Thu Jan 29 21:52:47 2015
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.grib;
+
+//JDK imports
+import static org.junit.Assert.*;
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import java.io.File;
+/**
+ * Test cases to exercise the {@link org.apache.tika.parser.grib.GribParser}.
+ */
+
+public class GribParserTest {
+
+    /**
+     * Parses the bundled GRIB2 sample and checks that the parser reports the
+     * GRIB content type and that the extracted text contains both the
+     * "dimensions:" and "variables:" sections.
+     */
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        Parser parser = new GribParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        InputStream stream = GribParser.class.getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2");
+        // Fail with a readable message rather than a NullPointerException
+        // from stream.close() when the sample is not on the classpath
+        assertNotNull("GRIB test document not found on classpath", stream);
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+        // The parser sets the content type before reading the file
+        assertEquals(GribParser.GRIB_MIME_TYPE, metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertTrue(content.contains("dimensions:"));
+        assertTrue(content.contains("variables:"));
+    }
+}
+ 

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2?rev=1655873&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream