You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/24 21:43:57 UTC

[tika] branch main updated: TIKA-3800 -- add UnrarParser as an optional parser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 94c8f03df TIKA-3800 -- add UnrarParser as an optional parser
94c8f03df is described below

commit 94c8f03dfac05f5bc0eaacb3c7e0c3f8571891bf
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jun 24 17:43:46 2022 -0400

    TIKA-3800 -- add UnrarParser as an optional parser
---
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/parser/pkg/UnrarParser.java    | 156 +++++++++++++++++++++
 .../apache/tika/parser/pkg/UnrarParserTest.java    |  59 ++++++++
 .../apache/tika/parser/pkg/tika-unrar-config.xml   |  26 ++++
 .../apache/tika/parser/pkg/UnrarParserTest.java    |  60 ++++++++
 5 files changed, 303 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 11f170b1b..a313fc4b4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.4.2 - ???
 
+   * Add unrar as an optional parser (TIKA-3800).
+
    * Refactor FuzzingCLI to use PipesParser (TIKA-3799).
 
    * ServiceLoader's loadServiceProviders() now guarantees
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java
new file mode 100644
index 000000000..a59563a49
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaTimeoutException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Parser for Rar files.  This relies on 'unrar' being installed
+ * and on the path.  This is not the default rar parser and must
+ * be selected via the tika-config.xml.
+ */
+public class UnrarParser extends AbstractParser {
+    private static final long serialVersionUID = 6157727985054451501L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("x-rar-compressed"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+        return SUPPORTED_TYPES;
+    }
+    private long timeoutMillis = 60000;
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        EmbeddedDocumentExtractor extractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
+        Path cwd = Files.createTempDirectory("tika-unrar-");
+        try {
+            Path tmp = Files.createTempFile(cwd, "input", ".rar");
+            try (OutputStream os = Files.newOutputStream(tmp, StandardOpenOption.WRITE)) {
+                IOUtils.copy(stream, os);
+            }
+            FileProcessResult result = unrar(cwd, tmp);
+            //delete the tmp rar file so that we don't recursively parse it in the next step
+            try {
+                Files.delete(tmp);
+            } catch (IOException e) {
+                //warn failed to delete tmp
+            }
+            if (result.isTimeout()) {
+                throw new TikaTimeoutException("timed out unrarring");
+            } else if (result.getExitValue() != 0) {
+                if (result.getStderr().contains("error in the encrypted file")) {
+                    throw new EncryptedDocumentException();
+                }
+                String msg = result.getStderr();
+                if (msg.length() > 100) {
+                    msg = msg.substring(0, 100);
+                }
+                throw new TikaException("Unrecoverable problem with rar file, exitValue=" +
+                        result.getExitValue() + " : " + msg);
+            }
+            //TODO: process stdout to extract status for each file:
+            //e.g. Extracting  test-documents/testEXCEL.xls                              OK
+            processDirectory(cwd, cwd, xhtml, extractor, context);
+        } finally {
+            FileUtils.deleteDirectory(cwd.toFile());
+        }
+        xhtml.endDocument();
+    }
+
+    private void processDirectory(Path baseDir, Path path,
+                               XHTMLContentHandler xhtml,
+                               EmbeddedDocumentExtractor extractor, ParseContext context)
+            throws IOException, SAXException {
+        for (File f : path.toFile().listFiles()) {
+            if (f.isDirectory()) {
+                processDirectory(baseDir, f.toPath(), xhtml, extractor,
+                        context);
+            } else {
+                processFile(baseDir, f.toPath(), xhtml, extractor, context);
+            }
+        }
+    }
+
+    private void processFile(Path base, Path embeddedFile,
+                             XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extractor, ParseContext context)
+            throws IOException, SAXException {
+        String relPath = base.relativize(embeddedFile).toString();
+        Metadata metadata = new Metadata();
+        String fName = FilenameUtils.getName(relPath);
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fName);
+        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, relPath);
+        if (extractor.shouldParseEmbedded(metadata)) {
+            try (InputStream is = TikaInputStream.get(embeddedFile)) {
+                extractor.parseEmbedded(is, xhtml, metadata, true);
+            }
+        }
+    }
+
+    private FileProcessResult unrar(Path cwd, Path tmp) throws IOException {
+        //we could use the -l option to check for potentially bad file names
+        //e.g. path traversals
+        ProcessBuilder pb = new ProcessBuilder();
+        pb.directory(cwd.toFile());
+        pb.command(
+                "unrar",
+                "x",  //extract with paths...hope that unrar protects against path traversals
+                "-kb", // keep broken files
+                "-p-", // we don't support passwords yet -- don't hang waiting for password on stdin
+                ProcessUtils.escapeCommandLine(tmp.toAbsolutePath().toString())
+
+        );
+        return ProcessUtils.execute(pb, timeoutMillis, 10000, 1000);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
new file mode 100644
index 000000000..414450d70
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assertions.fail;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.io.InputStream;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * Test case for parsing unrar files.
+ */
+public class UnrarParserTest extends AbstractPkgTest {
+
+    /**
+     * Encrypted RAR files are not supported yet: the only thing to verify
+     * is that parsing one fails with an {@link EncryptedDocumentException}.
+     */
+    @Test
+    public void testEncryptedRar() throws Exception {
+        // assumption: aborts (rather than fails) the test unless the "unrar" check passes
+        assumeTrue(ExternalParser.check("unrar"));
+
+        Parser unrarParser = new UnrarParser();
+        ContentHandler body = new BodyContentHandler();
+        Metadata md = new Metadata();
+
+        try (InputStream rar = getResourceAsStream("/test-documents/test-documents-enc.rar")) {
+            // the contents cannot be checked, so reaching fail() means no exception was thrown
+            unrarParser.parse(rar, body, md, trackingContext);
+            fail("No support yet for Encrypted RAR files");
+        } catch (EncryptedDocumentException expected) {
+            // this is the outcome we want for now
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-unrar-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-unrar-config.xml
new file mode 100644
index 000000000..d25a151a7
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-unrar-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pkg.RarParser"/> <!-- rar goes to UnrarParser -->
+    </parser>
+    <parser class="org.apache.tika.parser.pkg.UnrarParser">
+    </parser> <!-- not a default parser, so it is registered explicitly -->
+  </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
new file mode 100644
index 000000000..36f6b87f8
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+
+
+/**
+ * Test case for parsing rar files.
+ */
+public class UnrarParserTest extends AbstractPkgTest {
+
+    /**
+     * Tests that UnrarParser (selected via tika-unrar-config.xml) handles the container and
+     * that all the embedded entries are parsed.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        assumeTrue(ExternalParser.check("unrar"));
+        Parser p;
+        try (InputStream is = getResourceAsStream("tika-unrar-config.xml")) {
+            p = new AutoDetectParser(new TikaConfig(is));
+        }
+        List<Metadata> metadataList = getRecursiveMetadata("test-documents.rar", p);
+        assertEquals("org.apache.tika.parser.pkg.UnrarParser",
+                metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]);
+        assertEquals(12, metadataList.size());
+        //UnrarParser walks the extracted files with File.listFiles(), whose order is
+        //unspecified, so look the rtf up by name instead of expecting it at a fixed index
+        assertEquals(1L, metadataList.stream().filter(m -> "test-documents/testRTF.rtf"
+                .equals(m.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME))).count());
+    }
+}