You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/05 00:31:33 UTC

svn commit: r692281 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/pkg/ src/main/java/org/apache/tika/parser/pkg/bzip2/ src/main/resources/ src/main/resources/mime/ src/test/java/org/apache/tika/parser/pkg/ src/test/resources/test-...

Author: jukka
Date: Thu Sep  4 15:31:32 2008
New Revision: 692281

URL: http://svn.apache.org/viewvc?rev=692281&view=rev
Log:
TIKA-151: Stream compression support

Implemented support for both gzip and bzip2.

Like the tar support in TIKA-150, the bzip2 support here is based on code copied from the Apache Ant project.

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/
      - copied from r692275, ant/core/trunk/src/main/org/apache/tools/bzip2/
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
    incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2   (with props)
    incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz   (with props)
Removed:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2OutputStream.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/main/resources/tika-config.xml

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=692281&r1=692280&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Thu Sep  4 15:31:32 2008
@@ -66,6 +66,8 @@
 
 28. TIKA-150 - Parser for tar files (Jukka Zitting)
 
+29. TIKA-151 - Stream compression support (Jukka Zitting)
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java Thu Sep  4 15:31:32 2008
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.bzip2.CBZip2InputStream;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Bzip2 parser.
+ */
+public class Bzip2Parser extends PackageParser {
+
+    /**
+     * Parses the given stream as a bzip2 file.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        // The CBZip2InputStream class does not want to see the magic BZ header
+        if (stream.read() != 'B' || stream.read() != 'Z') {
+            throw new TikaException("Invalid BZip2 magic header");
+        }
+
+        metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // At the end we want to close the bzip2 stream to release any associated
+        // resources, but the underlying document stream should not be closed
+        InputStream bzip2 =
+            new CBZip2InputStream(new CloseShieldInputStream(stream));
+        try {
+            Metadata entrydata = new Metadata();
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                if (name.endsWith(".tbz")) {
+                    name = name.substring(0, name.length() - 4) + ".tar";
+                } else if (name.endsWith(".tbz2")) {
+                    name = name.substring(0, name.length() - 5) + ".tar";
+                } else if (name.endsWith(".bz")) {
+                    name = name.substring(0, name.length() - 3);
+                } else if (name.endsWith(".bz2")) {
+                    name = name.substring(0, name.length() - 4);
+                }
+                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+            }
+            parseEntry(bzip2, xhtml, entrydata);
+        } finally {
+            bzip2.close();
+        }
+
+        xhtml.endDocument();
+    }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java Thu Sep  4 15:31:32 2008
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for gzip compressed streams.
+ */
+public class GzipParser extends PackageParser {
+
+    /**
+     * Parses the uncompressed content of the given gzip stream as one entry.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Closing the gzip stream at the end releases its resources, while the
+        // close shield keeps the underlying document stream from being closed
+        InputStream gzip =
+            new GZIPInputStream(new CloseShieldInputStream(stream));
+        try {
+            Metadata entrydata = new Metadata();
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                if (name.endsWith(".tgz")) {
+                    name = name.substring(0, name.length() - 4) + ".tar";
+                } else if (name.endsWith(".gz") || name.endsWith("-gz")) {
+                    name = name.substring(0, name.length() - 3);
+                }
+                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+            }
+            parseEntry(gzip, xhtml, entrydata);
+        } finally {
+            gzip.close();
+        }
+
+        xhtml.endDocument();
+    }
+
+}

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java?rev=692281&r1=692275&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java Thu Sep  4 15:31:32 2008
@@ -20,9 +20,11 @@
  * This package is based on the work done by Keiron Liddle, Aftex Software
  * <ke...@aftexsw.com> to whom the Ant project is very grateful for his
  * great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
 
-package org.apache.tools.bzip2;
+package org.apache.tika.parser.pkg.bzip2;
 
 /**
  * Base class for both the compress and decompress classes.

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java?rev=692281&r1=692275&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java Thu Sep  4 15:31:32 2008
@@ -20,10 +20,12 @@
  * This package is based on the work done by Keiron Liddle, Aftex Software
  * <ke...@aftexsw.com> to whom the Ant project is very grateful for his
  * great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
-package org.apache.tools.bzip2;
 
-import java.io.ByteArrayInputStream;
+package org.apache.tika.parser.pkg.bzip2;
+
 import java.io.IOException;
 import java.io.InputStream;
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java?rev=692281&r1=692275&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java Thu Sep  4 15:31:32 2008
@@ -20,9 +20,11 @@
  * This package is based on the work done by Keiron Liddle, Aftex Software
  * <ke...@aftexsw.com> to whom the Ant project is very grateful for his
  * great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
 
-package org.apache.tools.bzip2;
+package org.apache.tika.parser.pkg.bzip2;
 
 /**
  * A simple class the hold and calculate the CRC for sanity checking

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=692281&r1=692280&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Thu Sep  4 15:31:32 2008
@@ -405,6 +405,26 @@
     <glob pattern="*.tar" />
   </mime-type>
 
+  <mime-type type="application/x-gzip">
+    <magic priority="40">
+      <match value="\037\213" type="string" offset="0" />
+    </magic>
+    <glob pattern="*.tgz" />
+    <glob pattern="*.gz" />
+    <glob pattern="*-gz" />
+  </mime-type>
+
+  <mime-type type="application/x-bzip">
+    <alias type="application/x-bzip2" />
+    <magic priority="40">
+      <match value="BZh" type="string" offset="0" />
+    </magic>
+    <glob pattern="*.bz" />
+    <glob pattern="*.bz2" />
+    <glob pattern="*.tbz" />
+    <glob pattern="*.tbz2" />
+  </mime-type>
+
   <mime-type type="application/msword">
     <glob pattern="*.doc" />
     <alias type="application/vnd.ms-word" />
@@ -485,15 +505,6 @@
     <glob pattern="*.wmls" />
   </mime-type>
 
-  <mime-type type="application/x-bzip">
-    <alias type="application/x-bzip2" />
-  </mime-type>
-
-  <mime-type type="application/x-bzip-compressed-tar">
-    <glob pattern="*.tbz" />
-    <glob pattern="*.tbz2" />
-  </mime-type>
-
   <mime-type type="application/x-cdlink">
     <_comment>Virtual CD-ROM CD Image File</_comment>
     <glob pattern="*.vcd" />

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692281&r1=692280&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep  4 15:31:32 2008
@@ -113,6 +113,14 @@
                 <mime>application/x-tar</mime>
         </parser>
 
+        <parser name="parse-gzip" class="org.apache.tika.parser.pkg.GzipParser">
+                <mime>application/x-gzip</mime>
+        </parser>
+
+        <parser name="parse-bzip2" class="org.apache.tika.parser.pkg.Bzip2Parser">
+                <mime>application/x-bzip</mime>
+        </parser>
+
     </parsers>
 
 </properties>
\ No newline at end of file

Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Thu Sep  4 15:31:32 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing bzip2 files.
+ */
+public class Bzip2ParserTest extends TestCase {
+    /** Parses test-documents.tbz2 without a type hint and checks the output. */
+    public void testBzip2Parsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/x-bzip", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        // Each archived document must contribute its entry name and its text
+        assertTrue(content.contains("test-documents/testEXCEL.xls"));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("test-documents/testHTML.html"));
+        assertTrue(content.contains("Test Indexation Html"));
+        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+        assertTrue(content.contains("This is a sample Open Office document"));
+        assertTrue(content.contains("test-documents/testPDF.pdf"));
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("test-documents/testPPT.ppt"));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("test-documents/testRTF.rtf"));
+        assertTrue(content.contains("indexation Word"));
+        assertTrue(content.contains("test-documents/testTXT.txt"));
+        assertTrue(content.contains("Test d'indexation de Txt"));
+        assertTrue(content.contains("test-documents/testWORD.doc"));
+        assertTrue(content.contains("This is a sample Microsoft Word Document"));
+        assertTrue(content.contains("test-documents/testXML.xml"));
+        assertTrue(content.contains("Rida Benjelloun"));
+    }
+
+}

Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Thu Sep  4 15:31:32 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing gzip files.
+ */
+public class GzipParserTest extends TestCase {
+    /** Parses test-documents.tgz without a type hint and checks the output. */
+    public void testGzipParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tgz");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        // Each archived document must contribute its entry name and its text
+        assertTrue(content.contains("test-documents/testEXCEL.xls"));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("test-documents/testHTML.html"));
+        assertTrue(content.contains("Test Indexation Html"));
+        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+        assertTrue(content.contains("This is a sample Open Office document"));
+        assertTrue(content.contains("test-documents/testPDF.pdf"));
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("test-documents/testPPT.ppt"));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("test-documents/testRTF.rtf"));
+        assertTrue(content.contains("indexation Word"));
+        assertTrue(content.contains("test-documents/testTXT.txt"));
+        assertTrue(content.contains("Test d'indexation de Txt"));
+        assertTrue(content.contains("test-documents/testWORD.doc"));
+        assertTrue(content.contains("This is a sample Microsoft Word Document"));
+        assertTrue(content.contains("test-documents/testXML.xml"));
+        assertTrue(content.contains("Rida Benjelloun"));
+    }
+
+}

Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2?rev=692281&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz?rev=692281&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream