You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2015/01/29 22:52:47 UTC
svn commit: r1655873 - in /tika/trunk: ./ tika-bundle/ tika-parsers/
tika-parsers/src/main/java/org/apache/tika/parser/grib/
tika-parsers/src/main/resources/META-INF/services/
tika-parsers/src/test/java/org/apache/tika/parser/grib/
tika-parsers/src/tes...
Author: lewismc
Date: Thu Jan 29 21:52:47 2015
New Revision: 1655873
URL: http://svn.apache.org/r1655873
Log:
TIKA-1423 Build a parser to extract data from GRIB formats
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2 (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-bundle/pom.xml
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Jan 29 21:52:47 2015
@@ -1,4 +1,6 @@
Release 1.8 - Current Development
+ * Build a parser to extract data from GRIB formats (TIKA-1423).
+
* Upgrade to Commons Compress 1.9 (TIKA-1534).
* Include media duration in metadata parsed by MP4Parser (TIKA-1530).
Modified: tika/trunk/tika-bundle/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Thu Jan 29 21:52:47 2015
@@ -125,7 +125,8 @@
boilerpipe, rome,
apache-mime4j-core, apache-mime4j-dom,
jhighlight, java-libpst,
- netcdf, jcip-annotations, jmatio
+ netcdf4, grib, cdm, httpservices, jcip-annotations,
+ jmatio, guava
</Embed-Dependency>
<Embed-Transitive>true</Embed-Transitive>
<Bundle-DocURL>${project.url}</Bundle-DocURL>
@@ -206,6 +207,9 @@
org.jdom;resolution:=optional,
org.jdom.input;resolution:=optional,
org.jdom.output;resolution:=optional,
+ org.jdom2;resolution:=optional,
+ org.jdom2.input;resolution:=optional,
+ org.jdom2.output;resolution:=optional,
org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
org.osgi.framework;resolution:=optional,
@@ -221,6 +225,25 @@
schemasMicrosoftComOfficePowerpoint;resolution:=optional,
schemasMicrosoftComOfficeWord;resolution:=optional,
sun.misc;resolution:=optional,
+ ucar.units;resolution:=optional,
+ ucar.httpservices;resolution:=optional,
+ ucar.nc2.util;resolution:=optional,
+ ucar.nc2.util.cache;resolution:=optional,
+ ucar.nc2.dataset;resolution:=optional,
+ ucar.nc2;resolution:=optional,
+ ucar.nc2.constants;resolution:=optional,
+ ucar.nc2.dt;resolution:=optional,
+ ucar.nc2.dt.grid;resolution:=optional,
+ ucar.nc2.ft;resolution:=optional,
+ ucar.nc2.iosp;resolution:=optional,
+ ucar.nc2.iosp.hdf4;resolution:=optional,
+ ucar.nc2.ncml;resolution:=optional,
+ ucar.nc2.stream;resolution:=optional,
+ ucar.nc2.time;resolution:=optional,
+ ucar.nc2.units;resolution:=optional,
+ ucar.nc2.wmo;resolution:=optional,
+ ucar.nc2.write;resolution:=optional,
+ ucar.ma2;resolution:=optional,
ucar.grib;resolution:=optional,
ucar.grib.grib1;resolution:=optional,
ucar.grib.grib2;resolution:=optional,
@@ -236,7 +259,56 @@
visad.data;resolution:=optional,
visad.data.vis5d;resolution:=optional,
visad.jmet;resolution:=optional,
- visad.util;resolution:=optional
+ visad.util;resolution:=optional,
+ colorspace;resolution:=optional,
+ com.sun.jna;resolution:=optional,
+ com.sun.jna.ptr;resolution:=optional,
+ icc;resolution:=optional,
+ jj2000.j2k.codestream;resolution:=optional,
+ jj2000.j2k.codestream.reader;resolution:=optional,
+ jj2000.j2k.decoder;resolution:=optional,
+ jj2000.j2k.entropy.decoder;resolution:=optional,
+ jj2000.j2k.fileformat.reader;resolution:=optional,
+ jj2000.j2k.image;resolution:=optional,
+ jj2000.j2k.image.invcomptransf;resolution:=optional,
+ jj2000.j2k.image.output;resolution:=optional,
+ jj2000.j2k.io;resolution:=optional,
+ jj2000.j2k.quantization.dequantizer;resolution:=optional,
+ jj2000.j2k.roi;resolution:=optional,
+ jj2000.j2k.util;resolution:=optional,
+ jj2000.j2k.wavelet.synthesis;resolution:=optional,
+ org.itadaki.bzip2;resolution:=optional,
+ org.jsoup;resolution:=optional,
+ org.jsoup.nodes;resolution:=optional,
+ org.jsoup.select;resolution:=optional,
+ thredds.featurecollection;resolution:=optional,
+ thredds.filesystem;resolution:=optional,
+ thredds.inventory;resolution:=optional,
+ thredds.inventory.filter;resolution:=optional,
+ thredds.inventory.partition;resolution:=optional,
+ com.beust.jcommander;resolution:=optional,
+ com.google.common.base;resolution:=optional,
+ com.google.common.math;resolution:=optional,
+ org.apache.http;resolution:=optional,
+ org.joda.time;resolution:=optional,
+ org.joda.time.chrono;resolution:=optional,
+ org.joda.time.field;resolution:=optional,
+ org.joda.time.format;resolution:=optional,
+ sun.reflect.generics.reflectiveObjects;resolution:=optional,
+ org.apache.http.auth;resolution:=optional,
+ org.apache.http.client;resolution:=optional,
+ org.apache.http.client.entity;resolution:=optional,
+ org.apache.http.client.methods;resolution:=optional,
+ org.apache.http.conn;resolution:=optional,
+ org.apache.http.conn.scheme;resolution:=optional,
+ org.apache.http.cookie;resolution:=optional,
+ org.apache.http.entity;resolution:=optional,
+ org.apache.http.impl.client;resolution:=optional,
+ org.apache.http.impl.conn;resolution:=optional,
+ org.apache.http.message;resolution:=optional,
+ org.apache.http.params;resolution:=optional,
+ org.apache.http.protocol;resolution:=optional,
+ org.apache.http.util;resolution:=optional
</Import-Package>
</instructions>
</configuration>
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Thu Jan 29 21:52:47 2015
@@ -44,6 +44,7 @@
<mime4j.version>0.7.2</mime4j.version>
<vorbis.version>0.6</vorbis.version>
<pdfbox.version>1.8.8</pdfbox.version>
+ <netcdf-java.version>4.5.4</netcdf-java.version>
</properties>
<dependencies>
@@ -78,11 +79,6 @@
<!-- Upstream parser libraries -->
<dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>netcdf</artifactId>
- <version>4.2.20</version>
- </dependency>
- <dependency>
<groupId>net.sourceforge.jmatio</groupId>
<artifactId>jmatio</artifactId>
<version>1.0</version>
@@ -234,6 +230,33 @@
<artifactId>slf4j-log4j12</artifactId>
<scope>test</scope>
</dependency>
+
+ <!-- edu.ucar dependencies -->
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>netcdf4</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>grib</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>cdm</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.ucar</groupId>
+ <artifactId>httpservices</artifactId>
+ <version>${netcdf-java.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>10.0.1</version>
+ </dependency>
</dependencies>
<build>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java?rev=1655873&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/grib/GribParser.java Thu Jan 29 21:52:47 2015
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.grib;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.File;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import ucar.nc2.Attribute;
+import ucar.nc2.Dimension;
+import ucar.nc2.NetcdfFile;
+import ucar.nc2.Variable;
+import ucar.nc2.dataset.NetcdfDataset;
+
+/**
+ * Parser for GRIB2 files that reads them through the UCAR NetCDF-Java API.
+ * <p>
+ * Global attributes of the opened file are copied into the {@link Metadata}
+ * object; dimensions, variables and variable attributes are written to the
+ * XHTML body.
+ */
+public class GribParser extends AbstractParser {
+
+    private static final long serialVersionUID = 7855458954474247655L;
+
+    /** Content type that {@link #parse} sets on every document it handles. */
+    public static final String GRIB_MIME_TYPE = "application/x-grib2";
+
+    // Immutable, so a single static instance is shared by all parser
+    // instances rather than being held once per instance.
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("x-grib2"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context parse context; not consulted
+     * @return a singleton set containing {@code application/x-grib2}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts metadata and a structural XHTML summary from a GRIB document.
+     * <p>
+     * The stream is made available as a file through {@link TikaInputStream},
+     * because the NetCDF-Java library is given a file path to open.
+     *
+     * @param stream   the GRIB document to parse
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the content type and the global attributes
+     * @param context  parse context; not consulted
+     * @throws IOException   if the stream cannot be made available as a file
+     * @throws SAXException  if the handler fails while receiving an event
+     * @throws TikaException if the file cannot be opened or closed, or if
+     *                       temporary resources cannot be released
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        //Set MIME type as grib2
+        metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE);
+
+        // Track whatever TikaInputStream creates for us so it can be
+        // released below, whether or not parsing succeeds.
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            File gribFile = tis.getFile();
+            extract(gribFile, handler, metadata);
+        } finally {
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Opens the file, emits its metadata and XHTML, and always closes it.
+     * Any {@link IOException} from opening or closing is rethrown wrapped in
+     * a {@link TikaException}.
+     */
+    private void extract(File gribFile, ContentHandler handler,
+            Metadata metadata) throws SAXException, TikaException {
+        try {
+            NetcdfFile ncFile = NetcdfDataset.openFile(gribFile.getAbsolutePath(), null);
+            try {
+                // first parse out the set of global attributes
+                addGlobalAttributes(ncFile, metadata);
+                writeStructure(ncFile, new XHTMLContentHandler(handler, metadata));
+            } finally {
+                ncFile.close();
+            }
+        } catch (IOException e) {
+            throw new TikaException("NetCDF parse error", e);
+        }
+    }
+
+    /** Copies every string or numeric global attribute into the metadata. */
+    private void addGlobalAttributes(NetcdfFile ncFile, Metadata metadata) {
+        for (Attribute attr : ncFile.getGlobalAttributes()) {
+            Property property = resolveMetadataKey(attr.getFullName());
+            if (attr.getDataType().isString()) {
+                metadata.add(property, attr.getStringValue());
+            } else if (attr.getDataType().isNumeric()) {
+                // NOTE(review): intValue() drops any fractional part and can
+                // overflow for wide values; kept to preserve existing output.
+                int value = attr.getNumericValue().intValue();
+                metadata.add(property, String.valueOf(value));
+            }
+        }
+    }
+
+    /** Writes the dimensions and variables of the file as XHTML. */
+    private void writeStructure(NetcdfFile ncFile, XHTMLContentHandler xhtml)
+            throws SAXException {
+        xhtml.startDocument();
+
+        xhtml.newline();
+        xhtml.startElement("ul");
+        xhtml.characters("dimensions:");
+        xhtml.newline();
+
+        for (Dimension dim : ncFile.getDimensions()) {
+            xhtml.element("li", dim.getFullName() + "=" + String.valueOf(dim.getLength()) + ";");
+            xhtml.newline();
+        }
+
+        // The second list is opened while the first is still open, so the
+        // two are nested; both are closed together at the end.
+        xhtml.startElement("ul");
+        xhtml.characters("variables:");
+        xhtml.newline();
+
+        for (Variable variable : ncFile.getVariables()) {
+            xhtml.element("p", String.valueOf(variable.getDataType()) + variable.getNameAndDimensions() + ";");
+            for (Attribute element : variable.getAttributes()) {
+                xhtml.element("li", " :" + element + ";");
+                xhtml.newline();
+            }
+        }
+        xhtml.endElement("ul");
+        xhtml.endElement("ul");
+        xhtml.endDocument();
+    }
+
+    /**
+     * Maps a global attribute name to a metadata property: {@code "title"}
+     * becomes {@link TikaCoreProperties#TITLE}, any other name becomes an
+     * internal text property of the same name.
+     */
+    private Property resolveMetadataKey(String localName) {
+        if ("title".equals(localName)) {
+            return TikaCoreProperties.TITLE;
+        }
+        return Property.internalText(localName);
+    }
+
+}
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1655873&r1=1655872&r2=1655873&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Thu Jan 29 21:52:47 2015
@@ -57,3 +57,4 @@ org.apache.tika.parser.code.SourceCodePa
org.apache.tika.parser.mat.MatParser
org.apache.tika.parser.ocr.TesseractOCRParser
org.apache.tika.parser.gdal.GDALParser
+org.apache.tika.parser.grib.GribParser
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java?rev=1655873&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/grib/GribParserTest.java Thu Jan 29 21:52:47 2015
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.grib;
+
+//JDK imports
+import static org.junit.Assert.*;
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import java.io.File;
+/**
+ * Test cases to exercise the {@link org.apache.tika.parser.grib.GribParser}.
+ */
+
+public class GribParserTest {
+    /** Parses the bundled GRIB2 sample; checks the content type and both XHTML section labels. */
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        InputStream stream = GribParserTest.class.getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2");
+        assertNotNull("GRIB2 test document is missing from the classpath", stream); // not an NPE in finally
+        Parser parser = new GribParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+        assertEquals("application/x-grib2", metadata.get(Metadata.CONTENT_TYPE));
+        assertTrue(handler.toString().contains("dimensions:"));
+        assertTrue(handler.toString().contains("variables:"));
+    }
+}
+
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2?rev=1655873&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/gdas1.forecmwf.2014062612.grib2
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream