You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/28 16:08:50 UTC

tika git commit: TIKA-2096 -- automatically add AutoDetectParser for embedded documents if the user forgets

Repository: tika
Updated Branches:
  refs/heads/master 2df8567ff -> 361ffa40a


TIKA-2096 -- automatically add AutoDetectParser for embedded documents if the user forgets


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/361ffa40
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/361ffa40
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/361ffa40

Branch: refs/heads/master
Commit: 361ffa40a5cee9f37d01f40c2074a18b04c4a6fb
Parents: 2df8567
Author: tballison <ta...@mitre.org>
Authored: Mon Nov 28 11:08:44 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Nov 28 11:08:44 2016 -0500

----------------------------------------------------------------------
 .../tika/extractor/EmbeddedDocumentUtil.java    | 18 ++++++++
 .../src/test/java/org/apache/tika/TikaTest.java |  1 -
 .../extractor/EmbeddedDocumentUtilTest.java     | 43 ++++++++++++++++++++
 .../parser/fork/ForkParserIntegrationTest.java  |  2 +
 .../tika/parser/jdbc/SQLite3ParserTest.java     |  9 ++--
 5 files changed, 68 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 3ceba90..2ff0efe 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -30,7 +30,9 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.utils.ExceptionUtils;
 import org.xml.sax.ContentHandler;
@@ -58,9 +60,25 @@ public class EmbeddedDocumentUtil implements Serializable {
         this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context);
     }
 
+    /**
+     * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext.
+     * As of Tika 1.15, an AutoDetectParser will automatically be added to parse
+     * embedded documents if no Parser.class is specified in the ParseContext.
+     * <p>
+     * If you'd prefer not to parse embedded documents, set Parser.class
+     * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
+     * @param context the ParseContext to query; a default Parser may be set on it as a side effect
+     * @return EmbeddedDocumentExtractor
+     */
     public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
         EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
         if (extractor == null) {
+            //ensure that an AutoDetectParser is
+            //available for parsing embedded docs TIKA-2096
+            Parser embeddedParser = context.get(Parser.class);
+            if (embeddedParser == null) {
+                context.set(Parser.class, new AutoDetectParser());
+            }
             extractor = new ParsingEmbeddedDocumentExtractor(context);
         }
         return extractor;

http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index aa673f0..6644d86 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -192,7 +192,6 @@ public abstract class TikaTest {
     protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
       if (context == null) {
           context = new ParseContext();
-          context.set(Parser.class, parser);
       }
 
       try {

http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java b/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
new file mode 100644
index 0000000..d09cf77
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.extractor;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Integration tests for EmbeddedDocumentUtil: default parser for embedded documents (TIKA-2096)
+ */
+public class EmbeddedDocumentUtilTest extends TikaTest {
+
+    @Test
+    public void testAutomaticAdditionOfAutoDetectParserIfForgotten() throws Exception {
+        String needle = "When in the Course";
+        //TIKA-2096: no Parser.class is set in this context, yet the needle must still be extracted
+        TikaTest.XMLResult xmlResult = getXML("test_recursive_embedded.doc", new ParseContext());
+        assertContains(needle, xmlResult.xml);
+
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, new EmptyParser()); //explicit opt-out of parsing embedded documents
+        xmlResult = getXML("test_recursive_embedded.doc", context);
+        assertNotContained(needle, xmlResult.xml);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
index 6a7739c..45605d9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -256,6 +257,7 @@ public class ForkParserIntegrationTest {
             InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
                     "/test-documents/testPDF.pdf");
             ParseContext context = new ParseContext();
+            context.set(Parser.class, new EmptyParser());
             parser.parse(stream, output, new Metadata(), context);
 
             String content = output.toString();

http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
index ca31991..e28921a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.Database;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
@@ -115,17 +116,17 @@ public class SQLite3ParserTest extends TikaTest {
         assertContains("tempor\n", s);
     }
 
-    //test what happens if the user forgets to pass in a parser via context
-    //to handle embedded documents
+    //test what happens if the user does not want embedded docs handled
     @Test
     public void testNotAddingEmbeddedParserToParseContext() throws Exception {
         Parser p = new AutoDetectParser();
         ContentHandler handler = new ToXMLContentHandler();
         Metadata metadata = new Metadata();
-
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(Parser.class, new EmptyParser());
         try (InputStream is = getResourceAsStream(TEST_FILE1)) {
             metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
-            p.parse(is, handler, metadata, new ParseContext());
+            p.parse(is, handler, metadata, parseContext);
         }
         String xml = handler.toString();
         //just includes headers for embedded documents