You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/17 12:15:47 UTC

svn commit: r1171936 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: hdf/HDFParser.java netcdf/NetCDFParser.java

Author: jukka
Date: Sat Sep 17 10:15:46 2011
New Revision: 1171936

URL: http://svn.apache.org/viewvc?rev=1171936&view=rev
Log:
TIKA-598: Update HDF parser and NetCDF parser to emit minimal XHTML

Also includes some streamlining and better error handling.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/hdf/HDFParser.java?rev=1171936&r1=1171935&r2=1171936&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/hdf/HDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/hdf/HDFParser.java Sat Sep 17 10:15:46 2011
@@ -26,11 +26,13 @@ import java.util.Set;
 
 //TIKA imports
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.netcdf.NetCDFParser;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -50,8 +52,8 @@ import ucar.nc2.NetcdfFile;
  */
 public class HDFParser extends AbstractParser {
 
-    private static final Set<MediaType> SUPPORTED_TYPES = Collections
-            .singleton(MediaType.application("x-hdf"));
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("x-hdf"));
 
     /*
      * (non-Javadoc)
@@ -76,10 +78,22 @@ public class HDFParser extends AbstractP
             Metadata metadata, ParseContext context) throws IOException,
             SAXException, TikaException {
         ByteArrayOutputStream os = new ByteArrayOutputStream();
-        this.writeStreamToMemory(stream, os);
+        IOUtils.copy(stream, os);
 
-        NetcdfFile ncFile = NetcdfFile.openInMemory("", os.toByteArray());
-        this.unravelStringMet(ncFile, null, metadata);
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name == null) {
+            name = "";
+        }
+        try {
+            NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
+            unravelStringMet(ncFile, null, metadata);
+        } catch (IOException e) {
+            throw new TikaException("HDF parse error", e);
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
     }
 
     protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
@@ -103,18 +117,4 @@ public class HDFParser extends AbstractP
         }
     }
 
-    protected void writeStreamToMemory(InputStream is, ByteArrayOutputStream os)
-            throws TikaException {
-        byte[] buf = new byte[512];
-
-        try {
-            while ((is.read(buf, 0, 512)) != -1) {
-                os.write(buf);
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-            throw new TikaException(e.getMessage());
-        }
-    }
-
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java?rev=1171936&r1=1171935&r2=1171936&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java Sat Sep 17 10:15:46 2011
@@ -25,11 +25,13 @@ import java.util.Set;
 
 //TIKA imports
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -46,8 +48,8 @@ import ucar.nc2.NetcdfFile;
  */
 public class NetCDFParser extends AbstractParser {
 
-    private final Set<MediaType> SUPPORTED_TYPES = Collections
-            .singleton(MediaType.application("x-netcdf"));
+    private final Set<MediaType> SUPPORTED_TYPES =
+        Collections.singleton(MediaType.application("x-netcdf"));
 
     /*
      * (non-Javadoc)
@@ -70,37 +72,34 @@ public class NetCDFParser extends Abstra
     public void parse(InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context) throws IOException,
             SAXException, TikaException {
-
         ByteArrayOutputStream os = new ByteArrayOutputStream();
-        this.writeStreamToMemory(stream, os);
-
-        NetcdfFile ncFile = NetcdfFile.openInMemory("", os.toByteArray());
+        IOUtils.copy(stream, os);
 
-        // first parse out the set of global attributes
-        for (Attribute attr : ncFile.getGlobalAttributes()) {
-            String attrName = attr.getName();
-            if (attr.getDataType().isString()) {
-                metadata.add(attrName, attr.getStringValue());
-            } else if (attr.getDataType().isNumeric()) {
-                metadata.add(attrName, String.valueOf(attr.getNumericValue()
-                        .intValue()));
-            }
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name == null) {
+            name = "";
         }
 
-    }
-
-    protected void writeStreamToMemory(InputStream is, ByteArrayOutputStream os)
-            throws TikaException {
-        byte[] buf = new byte[512];
-
         try {
-            while ((is.read(buf, 0, 512)) != -1) {
-                os.write(buf);
+            NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
+
+            // first parse out the set of global attributes
+            for (Attribute attr : ncFile.getGlobalAttributes()) {
+                String attrName = attr.getName();
+                if (attr.getDataType().isString()) {
+                    metadata.add(attrName, attr.getStringValue());
+                } else if (attr.getDataType().isNumeric()) {
+                    int value = attr.getNumericValue().intValue();
+                    metadata.add(attrName, String.valueOf(value));
+                }
             }
-        } catch (Exception e) {
-            e.printStackTrace();
-            throw new TikaException(e.getMessage());
-        }
+        } catch (IOException e) {
+            throw new TikaException("NetCDF parse error", e);
+        } 
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
     }
 
 }