You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/05 00:31:33 UTC
svn commit: r692281 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/pkg/
src/main/java/org/apache/tika/parser/pkg/bzip2/ src/main/resources/
src/main/resources/mime/ src/test/java/org/apache/tika/parser/pkg/
src/test/resources/test-...
Author: jukka
Date: Thu Sep 4 15:31:32 2008
New Revision: 692281
URL: http://svn.apache.org/viewvc?rev=692281&view=rev
Log:
TIKA-151: Stream compression support
Implemented support for both gzip and bzip2.
Like the tar support in TIKA-150, the bzip2 support here is based on code copied from the Apache Ant project.
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/
- copied from r692275, ant/core/trunk/src/main/org/apache/tools/bzip2/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2 (with props)
incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz (with props)
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2OutputStream.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/main/resources/tika-config.xml
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=692281&r1=692280&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Thu Sep 4 15:31:32 2008
@@ -66,6 +66,8 @@
28. TIKA-150 - Parser for tar files (Jukka Zitting)
+29. TIKA-151 - Stream compression support (Jukka Zitting)
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java Thu Sep 4 15:31:32 2008
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.bzip2.CBZip2InputStream;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Bzip2 parser.
+ */
+public class Bzip2Parser extends PackageParser {
+    /**
+     * Parses the given stream as a bzip2 file and passes the decompressed data
+     * to parseEntry, with .bz/.bz2 stripped and .tbz/.tbz2 turned into .tar.
+     * @throws TikaException if the stream does not start with "BZ"
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        // The CBZip2InputStream class does not want to see the magic BZ header
+        if (stream.read() != 'B' || stream.read() != 'Z') {
+            throw new TikaException("Invalid BZip2 magic header");
+        }
+
+        metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // At the end we want to close the bzip2 stream to release any associated
+        // resources, but the underlying document stream should not be closed
+        InputStream bzip2 =
+            new CBZip2InputStream(new CloseShieldInputStream(stream));
+        try {
+            Metadata entrydata = new Metadata();
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                if (name.endsWith(".tbz")) {
+                    name = name.substring(0, name.length() - 4) + ".tar";
+                } else if (name.endsWith(".tbz2")) {
+                    name = name.substring(0, name.length() - 5) + ".tar";
+                } else if (name.endsWith(".bz")) {
+                    name = name.substring(0, name.length() - 3);
+                } else if (name.endsWith(".bz2")) {
+                    name = name.substring(0, name.length() - 4);
+                }
+                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+            }
+            parseEntry(bzip2, xhtml, entrydata);
+        } finally {
+            bzip2.close();
+        }
+
+        xhtml.endDocument();
+    }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/GzipParser.java Thu Sep 4 15:31:32 2008
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for gzip compressed streams.
+ */
+public class GzipParser extends PackageParser {
+
+    /**
+     * Decompresses the given gzip stream and hands the data to parseEntry.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Closing the gzip stream below must leave the caller's stream open
+        InputStream shielded = new CloseShieldInputStream(stream);
+        InputStream decompressed = new GZIPInputStream(shielded);
+        try {
+            Metadata entryMetadata = new Metadata();
+            String entryName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (entryName != null) {
+                int length = entryName.length();
+                if (entryName.endsWith(".tgz")) {
+                    entryName = entryName.substring(0, length - 4) + ".tar";
+                } else if (entryName.endsWith(".gz") || entryName.endsWith("-gz")) {
+                    entryName = entryName.substring(0, length - 3);
+                }
+                entryMetadata.set(Metadata.RESOURCE_NAME_KEY, entryName);
+            }
+            parseEntry(decompressed, xhtml, entryMetadata);
+        } finally {
+            decompressed.close();
+        }
+
+        xhtml.endDocument();
+    }
+
+}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java?rev=692281&r1=692275&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/BZip2Constants.java Thu Sep 4 15:31:32 2008
@@ -20,9 +20,11 @@
* This package is based on the work done by Keiron Liddle, Aftex Software
* <ke...@aftexsw.com> to whom the Ant project is very grateful for his
* great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.bzip2;
+package org.apache.tika.parser.pkg.bzip2;
/**
* Base class for both the compress and decompress classes.
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java?rev=692281&r1=692275&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CBZip2InputStream.java Thu Sep 4 15:31:32 2008
@@ -20,10 +20,12 @@
* This package is based on the work done by Keiron Liddle, Aftex Software
* <ke...@aftexsw.com> to whom the Ant project is very grateful for his
* great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.bzip2;
-import java.io.ByteArrayInputStream;
+package org.apache.tika.parser.pkg.bzip2;
+
import java.io.IOException;
import java.io.InputStream;
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java?rev=692281&r1=692275&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/bzip2/CRC.java Thu Sep 4 15:31:32 2008
@@ -20,9 +20,11 @@
* This package is based on the work done by Keiron Liddle, Aftex Software
* <ke...@aftexsw.com> to whom the Ant project is very grateful for his
* great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.bzip2;
+package org.apache.tika.parser.pkg.bzip2;
/**
* A simple class the hold and calculate the CRC for sanity checking
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=692281&r1=692280&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Thu Sep 4 15:31:32 2008
@@ -405,6 +405,26 @@
<glob pattern="*.tar" />
</mime-type>
+ <mime-type type="application/x-gzip"> <!-- gzip streams; tika-config.xml maps this type to GzipParser -->
+ <magic priority="40">
+ <match value="\037\213" type="string" offset="0" /> <!-- presumably octal escapes for the gzip signature bytes 0x1F 0x8B; confirm against the magic reader -->
+ </magic>
+ <glob pattern="*.tgz" /> <!-- GzipParser turns a .tgz resource name into .tar for the inner entry -->
+ <glob pattern="*.gz" />
+ <glob pattern="*-gz" />
+ </mime-type>
+
+ <mime-type type="application/x-bzip"> <!-- bzip2 streams; tika-config.xml maps this type to Bzip2Parser -->
+ <alias type="application/x-bzip2" />
+ <magic priority="40">
+ <match value="BZh" type="string" offset="0" /> <!-- Bzip2Parser itself only checks the leading "BZ" -->
+ </magic>
+ <glob pattern="*.bz" />
+ <glob pattern="*.bz2" />
+ <glob pattern="*.tbz" /> <!-- *.tbz and *.tbz2 were previously globs of application/x-bzip-compressed-tar -->
+ <glob pattern="*.tbz2" />
+ </mime-type>
+
<mime-type type="application/msword">
<glob pattern="*.doc" />
<alias type="application/vnd.ms-word" />
@@ -485,15 +505,6 @@
<glob pattern="*.wmls" />
</mime-type>
- <mime-type type="application/x-bzip">
- <alias type="application/x-bzip2" />
- </mime-type>
-
- <mime-type type="application/x-bzip-compressed-tar">
- <glob pattern="*.tbz" />
- <glob pattern="*.tbz2" />
- </mime-type>
-
<mime-type type="application/x-cdlink">
<_comment>Virtual CD-ROM CD Image File</_comment>
<glob pattern="*.vcd" />
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692281&r1=692280&r2=692281&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep 4 15:31:32 2008
@@ -113,6 +113,14 @@
<mime>application/x-tar</mime>
</parser>
+ <parser name="parse-gzip" class="org.apache.tika.parser.pkg.GzipParser">
+ <mime>application/x-gzip</mime>
+ </parser>
+
+ <parser name="parse-bzip2" class="org.apache.tika.parser.pkg.Bzip2Parser">
+ <mime>application/x-bzip</mime>
+ </parser>
+
</parsers>
</properties>
\ No newline at end of file
Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Thu Sep 4 15:31:32 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing bzip2 files.
+ */
+public class Bzip2ParserTest extends TestCase {
+
+    public void testBzip2Parsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2");
+        assertNotNull("Test archive not found on the classpath", stream);
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/x-bzip", metadata.get(Metadata.CONTENT_TYPE));
+        // The extracted text should hold each entry's path and a content snippet
+        String content = handler.toString();
+        assertTrue(content.contains("test-documents/testEXCEL.xls"));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("test-documents/testHTML.html"));
+        assertTrue(content.contains("Test Indexation Html"));
+        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+        assertTrue(content.contains("This is a sample Open Office document"));
+        assertTrue(content.contains("test-documents/testPDF.pdf"));
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("test-documents/testPPT.ppt"));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("test-documents/testRTF.rtf"));
+        assertTrue(content.contains("indexation Word"));
+        assertTrue(content.contains("test-documents/testTXT.txt"));
+        assertTrue(content.contains("Test d'indexation de Txt"));
+        assertTrue(content.contains("test-documents/testWORD.doc"));
+        assertTrue(content.contains("This is a sample Microsoft Word Document"));
+        assertTrue(content.contains("test-documents/testXML.xml"));
+        assertTrue(content.contains("Rida Benjelloun"));
+    }
+}
Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=692281&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Thu Sep 4 15:31:32 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing gzip files.
+ */
+public class GzipParserTest extends TestCase {
+
+    public void testGzipParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = GzipParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tgz");
+        assertNotNull("Test archive not found on the classpath", stream);
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
+        // The extracted text should hold each entry's path and a content snippet
+        String content = handler.toString();
+        assertTrue(content.contains("test-documents/testEXCEL.xls"));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("test-documents/testHTML.html"));
+        assertTrue(content.contains("Test Indexation Html"));
+        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+        assertTrue(content.contains("This is a sample Open Office document"));
+        assertTrue(content.contains("test-documents/testPDF.pdf"));
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("test-documents/testPPT.ppt"));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("test-documents/testRTF.rtf"));
+        assertTrue(content.contains("indexation Word"));
+        assertTrue(content.contains("test-documents/testTXT.txt"));
+        assertTrue(content.contains("Test d'indexation de Txt"));
+        assertTrue(content.contains("test-documents/testWORD.doc"));
+        assertTrue(content.contains("This is a sample Microsoft Word Document"));
+        assertTrue(content.contains("test-documents/testXML.xml"));
+        assertTrue(content.contains("Rida Benjelloun"));
+    }
+}
Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2?rev=692281&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tbz2
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz?rev=692281&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tgz
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream