You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/14 12:21:32 UTC
svn commit: r1651619 - in /tika/trunk: tika-bundle/ tika-parsers/
tika-parsers/src/main/java/org/apache/tika/parser/pkg/
tika-parsers/src/main/resources/META-INF/services/
tika-parsers/src/test/java/org/apache/tika/parser/pkg/
Author: nick
Date: Wed Jan 14 11:21:32 2015
New Revision: 1651619
URL: http://svn.apache.org/r1651619
Log:
TIKA-241 Unrar parser from Luis Filipe Nassif
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
Modified:
tika/trunk/tika-bundle/pom.xml
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/tika-bundle/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1651619&r1=1651618&r2=1651619&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Wed Jan 14 11:21:32 2015
@@ -112,7 +112,7 @@
</Bundle-Activator>
<Embed-Dependency>
tika-parsers;inline=true,
- commons-compress, xz, commons-codec,
+ commons-compress, xz, commons-codec, junrar,
pdfbox,fontbox,jempbox,bcmail-jdk15,bcprov-jdk15,
poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
xmlbeans,
@@ -171,6 +171,9 @@
org.apache.commons.httpclient.params;resolution:=optional,
org.apache.commons.httpclient.protocol;resolution:=optional,
org.apache.commons.httpclient.util;resolution:=optional,
+ org.apache.commons.vfs2;resolution:=optional,
+ org.apache.commons.vfs2.provider;resolution:=optional,
+ org.apache.commons.vfs2.util;resolution:=optional,
org.apache.crimson.jaxp;resolution:=optional,
org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
org.apache.tools.ant;resolution:=optional,
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1651619&r1=1651618&r2=1651619&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Jan 14 11:21:32 2015
@@ -212,6 +212,11 @@
<artifactId>java-libpst</artifactId>
<version>0.8.1</version>
</dependency>
+ <dependency>
+ <groupId>com.github.junrar</groupId>
+ <artifactId>junrar</artifactId>
+ <version>0.7</version>
+ </dependency>
<!-- Test dependencies -->
<dependency>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1651619&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java Wed Jan 14 11:21:32 2015
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.github.junrar.Archive;
+import com.github.junrar.exception.RarException;
+import com.github.junrar.rarfile.FileHeader;
+
+/**
+ * Parser for RAR archives.
+ * <p>
+ * The archive is opened with junrar's {@link Archive} from the file returned
+ * by {@link TikaInputStream#getFile()}. Every non-directory entry is handed
+ * to the {@link EmbeddedDocumentExtractor} found in the {@link ParseContext}
+ * (or to a {@link ParsingEmbeddedDocumentExtractor} when none is configured),
+ * together with its name, creation/modification times and unpacked size.
+ * Encrypted archives are rejected with an {@link EncryptedDocumentException}.
+ */
+public class RarParser extends AbstractParser {
+    private static final long serialVersionUID = 6157727985054451501L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("x-rar-compressed"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context parse context (not consulted)
+     * @return a singleton set containing <code>application/x-rar-compressed</code>
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Walks the entries of a RAR archive and passes each file entry to the
+     * embedded document extractor.
+     *
+     * @param stream   the RAR archive to read
+     * @param handler  receiver of the XHTML SAX events; also passed to the
+     *                 extractor for every embedded entry
+     * @param metadata metadata of the archive document itself
+     * @param context  parse context, consulted for an
+     *                 {@link EmbeddedDocumentExtractor}
+     * @throws IOException   if the archive or a temporary resource cannot be
+     *                       read or closed
+     * @throws SAXException  if the content handler fails
+     * @throws TikaException if junrar reports a {@link RarException}, or (as
+     *                       {@link EncryptedDocumentException}) if the
+     *                       archive is encrypted
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        EmbeddedDocumentExtractor extractor = context.get(
+                EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        TemporaryResources tmp = new TemporaryResources();
+        Archive rar = null;
+        try {
+            // junrar works on a File, so go through TikaInputStream to get one.
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            rar = new Archive(tis.getFile());
+
+            if (rar.isEncrypted()) {
+                throw new EncryptedDocumentException();
+            }
+
+            //Without this BodyContentHandler does not work
+            xhtml.element("div", " ");
+
+            // Stop early when the thread is interrupted so that a caller can
+            // abort the processing of a large archive.
+            FileHeader header = rar.nextFileHeader();
+            while (header != null && !Thread.currentThread().isInterrupted()) {
+
+                if (!header.isDirectory()) {
+
+                    InputStream subFile = null;
+                    try {
+
+                        subFile = rar.getInputStream(header);
+
+                        Metadata entrydata = new Metadata();
+                        // RAR entry names may use backslashes; normalise
+                        // them to forward slashes.
+                        entrydata.set(Metadata.RESOURCE_NAME_KEY, header
+                                .getFileNameString().replace("\\", "/"));
+                        entrydata.set(TikaCoreProperties.CREATED,
+                                header.getCTime());
+                        entrydata.set(TikaCoreProperties.MODIFIED,
+                                header.getMTime());
+                        entrydata.set(Metadata.CONTENT_LENGTH,
+                                Long.toString(header.getFullUnpackSize()));
+
+                        if (extractor.shouldParseEmbedded(entrydata)) {
+                            extractor.parseEmbedded(subFile, handler,
+                                    entrydata, true);
+                        }
+
+                    } finally {
+                        if (subFile != null) {
+                            subFile.close();
+                        }
+                    }
+                }
+
+                header = rar.nextFileHeader();
+            }
+
+        } catch (RarException e) {
+            throw new TikaException("RarParser Exception", e);
+        } finally {
+            // Release the temporary resources even when closing the archive
+            // fails; otherwise an exception from rar.close() would leak them.
+            try {
+                if (rar != null) {
+                    rar.close();
+                }
+            } finally {
+                tmp.close();
+            }
+        }
+
+        xhtml.endDocument();
+    }
+}
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1651619&r1=1651618&r2=1651619&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 14 11:21:32 2015
@@ -46,6 +46,7 @@ org.apache.tika.parser.odf.OpenDocumentP
org.apache.tika.parser.pdf.PDFParser
org.apache.tika.parser.pkg.CompressorParser
org.apache.tika.parser.pkg.PackageParser
+org.apache.tika.parser.pkg.RarParser
org.apache.tika.parser.rtf.RTFParser
org.apache.tika.parser.txt.TXTParser
org.apache.tika.parser.video.FLVParser
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java?rev=1651619&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java Wed Jan 14 11:21:32 2015
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing rar files.
+ */
+public class RarParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testRarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = RarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.rar");
+ try {
+ parser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertTrue(content.contains("test-documents/testEXCEL.xls"));
+ assertTrue(content.contains("Sample Excel Worksheet"));
+ assertTrue(content.contains("test-documents/testHTML.html"));
+ assertTrue(content.contains("Test Indexation Html"));
+ assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+ assertTrue(content.contains("This is a sample Open Office document"));
+ assertTrue(content.contains("test-documents/testPDF.pdf"));
+ assertTrue(content.contains("Apache Tika"));
+ assertTrue(content.contains("test-documents/testPPT.ppt"));
+ assertTrue(content.contains("Sample Powerpoint Slide"));
+ assertTrue(content.contains("test-documents/testRTF.rtf"));
+ assertTrue(content.contains("indexation Word"));
+ assertTrue(content.contains("test-documents/testTXT.txt"));
+ assertTrue(content.contains("Test d'indexation de Txt"));
+ assertTrue(content.contains("test-documents/testWORD.doc"));
+ assertTrue(content.contains("This is a sample Microsoft Word Document"));
+ assertTrue(content.contains("test-documents/testXML.xml"));
+ assertTrue(content.contains("Rida Benjelloun"));
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = RarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.rar");
+ try {
+ parser.parse(stream, handler, metadata, trackingContext);
+ } finally {
+ stream.close();
+ }
+
+ // Should have found all 9 documents, but not the directory
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names but not content types, as rar doesn't
+ // store the content types
+ assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+ assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+ assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+ assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+ }
+}
\ No newline at end of file