You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/14 12:21:32 UTC

svn commit: r1651619 - in /tika/trunk: tika-bundle/ tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/main/resources/META-INF/services/ tika-parsers/src/test/java/org/apache/tika/parser/pkg/

Author: nick
Date: Wed Jan 14 11:21:32 2015
New Revision: 1651619

URL: http://svn.apache.org/r1651619
Log:
TIKA-241 Unrar parser from Luis Filipe Nassif

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
Modified:
    tika/trunk/tika-bundle/pom.xml
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/tika-bundle/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1651619&r1=1651618&r2=1651619&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Wed Jan 14 11:21:32 2015
@@ -112,7 +112,7 @@
             </Bundle-Activator>
             <Embed-Dependency>
               tika-parsers;inline=true,
-              commons-compress, xz, commons-codec,
+              commons-compress, xz, commons-codec, junrar,
               pdfbox,fontbox,jempbox,bcmail-jdk15,bcprov-jdk15,
               poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
               xmlbeans,
@@ -171,6 +171,9 @@
               org.apache.commons.httpclient.params;resolution:=optional,
               org.apache.commons.httpclient.protocol;resolution:=optional,
               org.apache.commons.httpclient.util;resolution:=optional,
+              org.apache.commons.vfs2;resolution:=optional,
+              org.apache.commons.vfs2.provider;resolution:=optional,
+              org.apache.commons.vfs2.util;resolution:=optional,
               org.apache.crimson.jaxp;resolution:=optional,
               org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
               org.apache.tools.ant;resolution:=optional,

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1651619&r1=1651618&r2=1651619&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Jan 14 11:21:32 2015
@@ -212,6 +212,11 @@
       <artifactId>java-libpst</artifactId>
       <version>0.8.1</version>
     </dependency>
+    <dependency>
+      <groupId>com.github.junrar</groupId>
+      <artifactId>junrar</artifactId>
+      <version>0.7</version>
+    </dependency>
 
     <!-- Test dependencies -->
     <dependency>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1651619&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java Wed Jan 14 11:21:32 2015
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.github.junrar.Archive;
+import com.github.junrar.exception.RarException;
+import com.github.junrar.rarfile.FileHeader;
+
+/**
+ * Parser for Rar files.
+ */
+public class RarParser extends AbstractParser {
+    private static final long serialVersionUID = 6157727985054451501L;
+
+    /** The only media type this parser claims: application/x-rar-compressed. */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("x-rar-compressed"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param arg0 the parse context (not consulted)
+     * @return a singleton set holding the RAR media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Iterates over the entries of a RAR archive and passes every
+     * non-directory entry to the {@link EmbeddedDocumentExtractor} taken
+     * from the context, falling back to a
+     * {@link ParsingEmbeddedDocumentExtractor} when none is registered.
+     * <p>
+     * Iteration stops early, without raising an error, once the current
+     * thread has been interrupted.
+     *
+     * @param stream   the RAR document; it is wrapped in a
+     *                 {@link TikaInputStream} so the archive can be opened
+     *                 from a file
+     * @param handler  receives the XHTML events of this document and is
+     *                 passed on to the embedded document extractor
+     * @param metadata metadata of the archive itself
+     * @param context  parse context, consulted for the extractor
+     * @throws IOException   if reading the archive or releasing its
+     *                       resources fails
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if the archive is encrypted (raised as
+     *                       {@link EncryptedDocumentException}) or the RAR
+     *                       library reports a {@link RarException}
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        EmbeddedDocumentExtractor extractor = context.get(
+                EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        TemporaryResources tmp = new TemporaryResources();
+        Archive rar = null;
+        try {
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            rar = new Archive(tis.getFile());
+
+            if (rar.isEncrypted()) {
+                throw new EncryptedDocumentException();
+            }
+
+            //Without this BodyContentHandler does not work
+            xhtml.element("div", " ");
+
+            FileHeader header = rar.nextFileHeader();
+            while (header != null && !Thread.currentThread().isInterrupted()) {
+                if (!header.isDirectory()) {
+                    parseEntry(rar, header, extractor, handler);
+                }
+                header = rar.nextFileHeader();
+            }
+
+        } catch (RarException e) {
+            throw new TikaException("RarParser Exception", e);
+        } finally {
+            // Release the temporary resources even when closing the archive
+            // itself fails; otherwise they would be leaked.
+            try {
+                if (rar != null) {
+                    rar.close();
+                }
+            } finally {
+                tmp.close();
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Builds the metadata for a single archive entry and hands the entry's
+     * content to the extractor, closing the entry stream afterwards.
+     *
+     * @param rar       the open archive the entry belongs to
+     * @param header    header of the (non-directory) entry to process
+     * @param extractor decides whether the entry is parsed and parses it
+     * @param handler   handler passed through to the extractor
+     * @throws RarException if the entry stream cannot be obtained
+     * @throws IOException  if reading or closing the entry stream fails
+     * @throws SAXException if the handler rejects an event
+     */
+    private void parseEntry(Archive rar, FileHeader header,
+            EmbeddedDocumentExtractor extractor, ContentHandler handler)
+            throws RarException, IOException, SAXException {
+        InputStream subFile = null;
+        try {
+            subFile = rar.getInputStream(header);
+
+            Metadata entrydata = new Metadata();
+            // Entry names may use backslashes; expose them with forward slashes.
+            entrydata.set(Metadata.RESOURCE_NAME_KEY, header
+                    .getFileNameString().replace("\\", "/"));
+            entrydata.set(TikaCoreProperties.CREATED, header.getCTime());
+            entrydata.set(TikaCoreProperties.MODIFIED, header.getMTime());
+            entrydata.set(Metadata.CONTENT_LENGTH,
+                    Long.toString(header.getFullUnpackSize()));
+
+            if (extractor.shouldParseEmbedded(entrydata)) {
+                extractor.parseEmbedded(subFile, handler, entrydata, true);
+            }
+        } finally {
+            if (subFile != null) {
+                subFile.close();
+            }
+        }
+    }
+}
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1651619&r1=1651618&r2=1651619&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 14 11:21:32 2015
@@ -46,6 +46,7 @@ org.apache.tika.parser.odf.OpenDocumentP
 org.apache.tika.parser.pdf.PDFParser
 org.apache.tika.parser.pkg.CompressorParser
 org.apache.tika.parser.pkg.PackageParser
+org.apache.tika.parser.pkg.RarParser
 org.apache.tika.parser.rtf.RTFParser
 org.apache.tika.parser.txt.TXTParser
 org.apache.tika.parser.video.FLVParser

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java?rev=1651619&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java Wed Jan 14 11:21:32 2015
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing rar files.
+ */
+public class RarParserTest extends AbstractPkgTest {
+
+    @Test
+    public void testRarParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = RarParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.rar");
+        try {
+            parser.parse(stream, handler, metadata, recursingContext);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertTrue(content.contains("test-documents/testEXCEL.xls"));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("test-documents/testHTML.html"));
+        assertTrue(content.contains("Test Indexation Html"));
+        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+        assertTrue(content.contains("This is a sample Open Office document"));
+        assertTrue(content.contains("test-documents/testPDF.pdf"));
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("test-documents/testPPT.ppt"));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("test-documents/testRTF.rtf"));
+        assertTrue(content.contains("indexation Word"));
+        assertTrue(content.contains("test-documents/testTXT.txt"));
+        assertTrue(content.contains("Test d'indexation de Txt"));
+        assertTrue(content.contains("test-documents/testWORD.doc"));
+        assertTrue(content.contains("This is a sample Microsoft Word Document"));
+        assertTrue(content.contains("test-documents/testXML.xml"));
+        assertTrue(content.contains("Rida Benjelloun"));
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     *  fired for all the embedded entries.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+       InputStream stream = RarParserTest.class.getResourceAsStream(
+               "/test-documents/test-documents.rar");
+       try {
+           parser.parse(stream, handler, metadata, trackingContext);
+       } finally {
+           stream.close();
+       }
+       
+       // Should have found all 9 documents, but not the directory
+       assertEquals(9, tracker.filenames.size());
+       assertEquals(9, tracker.mediatypes.size());
+       assertEquals(9, tracker.modifiedAts.size());
+       
+       // Should have names but not content types, as rar doesn't
+       //  store the content types
+       assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+       assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+       assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+       assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+       assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+       assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+       assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+       assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+       assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+       
+       for(String type : tracker.mediatypes) {
+          assertNull(type);
+       }
+       for(String mod : tracker.modifiedAts) {
+           assertNotNull(mod);
+           assertTrue("Modified at " + mod, mod.startsWith("20"));
+       }
+    }
+}
\ No newline at end of file