You are viewing a plain text version of this content. The hyperlink to the canonical version was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/28 16:08:50 UTC
tika git commit: TIKA-2096 -- automatically add AutoDetectParser for
embedded documents if the user forgets
Repository: tika
Updated Branches:
refs/heads/master 2df8567ff -> 361ffa40a
TIKA-2096 -- automatically add AutoDetectParser for embedded documents if the user forgets
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/361ffa40
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/361ffa40
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/361ffa40
Branch: refs/heads/master
Commit: 361ffa40a5cee9f37d01f40c2074a18b04c4a6fb
Parents: 2df8567
Author: tballison <ta...@mitre.org>
Authored: Mon Nov 28 11:08:44 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Nov 28 11:08:44 2016 -0500
----------------------------------------------------------------------
.../tika/extractor/EmbeddedDocumentUtil.java | 18 ++++++++
.../src/test/java/org/apache/tika/TikaTest.java | 1 -
.../extractor/EmbeddedDocumentUtilTest.java | 43 ++++++++++++++++++++
.../parser/fork/ForkParserIntegrationTest.java | 2 +
.../tika/parser/jdbc/SQLite3ParserTest.java | 9 ++--
5 files changed, 68 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 3ceba90..2ff0efe 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -30,7 +30,9 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.ContentHandler;
@@ -58,9 +60,25 @@ public class EmbeddedDocumentUtil implements Serializable {
this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context);
}
+ /**
+ * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext.
+ * As of Tika 1.15, an AutoDetectParser will automatically be added to parse
+ * embedded documents if no Parser.class is specified in the ParseContext.
+ * <p/>
+ * If you'd prefer not to parse embedded documents, set Parser.class
+ * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
+ * @param context
+ * @return EmbeddedDocumentExtractor
+ */
public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
if (extractor == null) {
+ //ensure that an AutoDetectParser is
+ //available for parsing embedded docs TIKA-2096
+ Parser embeddedParser = context.get(Parser.class);
+ if (embeddedParser == null) {
+ context.set(Parser.class, new AutoDetectParser());
+ }
extractor = new ParsingEmbeddedDocumentExtractor(context);
}
return extractor;
http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index aa673f0..6644d86 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -192,7 +192,6 @@ public abstract class TikaTest {
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
if (context == null) {
context = new ParseContext();
- context.set(Parser.class, parser);
}
try {
http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java b/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
new file mode 100644
index 0000000..d09cf77
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.extractor;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Integration tests for EmbeddedDocumentUtil
+ */
+public class EmbeddedDocumentUtilTest extends TikaTest {
+
+ @Test
+ public void testAutomaticAdditionOfAutoDetectParserIfForgotten() throws Exception {
+ String needle = "When in the Course";
+ //TIKA-2096
+ TikaTest.XMLResult xmlResult = getXML("test_recursive_embedded.doc", new ParseContext());
+ assertContains(needle, xmlResult.xml);
+
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new EmptyParser());
+ xmlResult = getXML("test_recursive_embedded.doc", context);
+ assertNotContained(needle, xmlResult.xml);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
index 6a7739c..45605d9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
@@ -256,6 +257,7 @@ public class ForkParserIntegrationTest {
InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
"/test-documents/testPDF.pdf");
ParseContext context = new ParseContext();
+ context.set(Parser.class, new EmptyParser());
parser.parse(stream, output, new Metadata(), context);
String content = output.toString();
http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
index ca31991..e28921a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
@@ -115,17 +116,17 @@ public class SQLite3ParserTest extends TikaTest {
assertContains("tempor\n", s);
}
- //test what happens if the user forgets to pass in a parser via context
- //to handle embedded documents
+ //test what happens if the user does not want embedded docs handled
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
Parser p = new AutoDetectParser();
ContentHandler handler = new ToXMLContentHandler();
Metadata metadata = new Metadata();
-
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(Parser.class, new EmptyParser());
try (InputStream is = getResourceAsStream(TEST_FILE1)) {
metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
- p.parse(is, handler, metadata, new ParseContext());
+ p.parse(is, handler, metadata, parseContext);
}
String xml = handler.toString();
//just includes headers for embedded documents