You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/22 02:19:22 UTC
[10/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
new file mode 100644
index 0000000..0ed428e
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -0,0 +1,312 @@
+package org.apache.tika.parser;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.digesting.CommonsDigester;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class RecursiveParserWrapperTest {
+
+ @Test
+ public void testBasicXML() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //not much differentiates html from xml in this test file
+ assertTrue(content.indexOf("<p class=\"header\" />") > -1);
+ }
+
+ @Test
+ public void testBasicHTML() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //not much differentiates html from xml in this test file
+ assertTrue(content.indexOf("<p class=\"header\"></p>") > -1);
+ }
+
+ @Test
+ public void testBasicText() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertTrue(content.indexOf("<p ") < 0);
+ assertTrue(content.indexOf("embed_0") > -1);
+ }
+
+ @Test
+ public void testIgnoreContent() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertNull(content);
+ }
+
+
+ @Test
+ public void testCharLimit() throws Exception {
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
+ InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ List<Metadata> list = wrapper.getMetadata();
+
+ assertEquals(5, list.size());
+
+ int wlr = 0;
+ for (Metadata m : list) {
+ String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED);
+ if (limitReached != null && limitReached.equals("true")) {
+ wlr++;
+ }
+ }
+ assertEquals(1, wlr);
+
+ }
+
+ @Test
+ public void testMaxEmbedded() throws Exception {
+ int maxEmbedded = 4;
+ int totalNoLimit = 12;//including outer container file
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+ String limitReached = null;
+
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+ InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ List<Metadata> list = wrapper.getMetadata();
+ //test default
+ assertEquals(totalNoLimit, list.size());
+
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertNull(limitReached);
+
+
+ wrapper.reset();
+ stream.close();
+
+ //test setting value
+ metadata = new Metadata();
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.setMaxEmbeddedResources(maxEmbedded);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ list = wrapper.getMetadata();
+
+ //add 1 for outer container file
+ assertEquals(maxEmbedded + 1, list.size());
+
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertEquals("true", limitReached);
+
+ wrapper.reset();
+ stream.close();
+
+ //test setting value < 0
+ metadata = new Metadata();
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+
+ wrapper.setMaxEmbeddedResources(-2);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ assertEquals(totalNoLimit, list.size());
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertNull(limitReached);
+ }
+
+ @Test
+ public void testEmbeddedResourcePath() throws Exception {
+
+ Set<String> targets = new HashSet<String>();
+ targets.add("/embed1.zip");
+ targets.add("/embed1.zip/embed2.zip");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
+ targets.add("/embed1.zip/embed2.zip/embed2a.txt");
+ targets.add("/embed1.zip/embed2.zip/embed2b.txt");
+ targets.add("/embed1.zip/embed1b.txt");
+ targets.add("/embed1.zip/embed1a.txt");
+ targets.add("/image1.emf");
+
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertTrue(content.indexOf("<p class=\"header\" />") > -1);
+
+ Set<String> seen = new HashSet<String>();
+ for (Metadata m : list) {
+ String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+ if (path != null) {
+ seen.add(path);
+ }
+ }
+ assertEquals(targets, seen);
+ }
+
+ @Test
+ public void testEmbeddedNPE() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+ //default behavior (user doesn't specify whether or not to catch embedded exceptions
+ //is to catch the exception
+ assertEquals(13, list.size());
+ Metadata mockNPEMetadata = list.get(10);
+ assertContains("java.lang.NullPointerException", mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+
+ metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
+ list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+ false, null);
+
+ //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
+ //and just doesn't bother to report that there was an exception.
+ assertEquals(12, list.size());
+ }
+
+ @Test
+ public void testPrimaryExcWEmbedded() throws Exception {
+ //if embedded content is handled and then
+ //the parser hits an exception in the container document,
+ //that the first element of the returned list is the container document
+ //and the second is the embedded content
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");
+
+ ParseContext context = new ParseContext();
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
+ String path = "/test-documents/mock/embedded_then_npe.xml";
+
+ InputStream stream = null;
+ boolean npe = false;
+ try {
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ path);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } catch (TikaException e) {
+ if (e.getCause().getClass().equals(NullPointerException.class)) {
+ npe = true;
+ }
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ assertTrue("npe", npe);
+
+ List<Metadata> metadataList = wrapper.getMetadata();
+ assertEquals(2, metadataList.size());
+ Metadata outerMetadata = metadataList.get(0);
+ Metadata embeddedMetadata = metadataList.get(1);
+ assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+ assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
+
+ assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+ assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
+ }
+
+ @Test
+ public void testDigesters() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+ true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
+ int i = 0;
+ Metadata m0 = list.get(0);
+ Metadata m6 = list.get(6);
+ String md5Key = "X-TIKA:digest:MD5";
+ assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
+ assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
+ assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
+ }
+
+ private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
+ boolean catchEmbeddedExceptions,
+ DigestingParser.Digester digester) throws Exception {
+ ParseContext context = new ParseContext();
+ Parser wrapped = new AutoDetectParser();
+ if (digester != null) {
+ wrapped = new DigestingParser(wrapped, digester);
+ }
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ contentHandlerFactory, catchEmbeddedExceptions);
+ String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (path == null) {
+ path = "/test-documents/test_recursive_embedded.docx";
+ } else {
+ path = "/test-documents/" + path;
+ }
+ InputStream stream = null;
+ try {
+ stream = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ return wrapper.getMetadata();
+
+ }
+
+ private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory)
+ throws Exception {
+ return getMetadata(metadata, contentHandlerFactory, true, null);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java b/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java
new file mode 100644
index 0000000..cde3e78
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Junit test class for Tika {@link Parser}s.
+ */
+public class TestParsers extends TikaTest {
+
+ private TikaConfig tc;
+
+ private Tika tika;
+
+ @Before
+ public void setUp() throws Exception {
+ tc = TikaConfig.getDefaultConfig();
+ tika = new Tika(tc);
+ }
+
+ @Test
+ public void testWORDExtraction() throws Exception {
+
+ Path tmpFile = getTestDocumentAsTempFile("testWORD.doc");
+ Parser parser = tika.getParser();
+ Metadata metadata = new Metadata();
+ try (InputStream stream = Files.newInputStream(tmpFile)) {
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+ } finally {
+ Files.delete(tmpFile);
+ }
+ assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+ }
+
+ @Test
+ public void testEXCELExtraction() throws Exception {
+ final String expected = "Numbers and their Squares";
+ Path tmpFile = getTestDocumentAsTempFile("testEXCEL.xls");
+ try {
+ String s1 = tika.parseToString(tmpFile);
+ assertTrue("Text does not contain '" + expected + "'", s1
+ .contains(expected));
+ Parser parser = tika.getParser();
+ Metadata metadata = new Metadata();
+ try (InputStream stream = Files.newInputStream(tmpFile)) {
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+ }
+ assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
+ } finally {
+ Files.delete(tmpFile);
+ }
+ }
+
+ @Test
+ public void testOptionalHyphen() throws Exception {
+ String[] extensions =
+ new String[] { "ppt", "pptx", "doc", "docx", "rtf", "pdf"};
+ for (String extension : extensions) {
+ Path tmpFile = getTestDocumentAsTempFile("testOptionalHyphen." + extension);
+ String content = null;
+ try {
+ content = tika.parseToString(tmpFile);
+ } finally {
+ Files.delete(tmpFile);
+ }
+ assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content,
+ content.contains("optionalhyphen") ||
+ content.contains("optional\u00adhyphen") || // soft hyphen
+ content.contains("optional\u200bhyphen") || // zero width space
+ content.contains("optional\u2027")); // hyphenation point
+
+ }
+ }
+
+ @Test
+ public void testComment() throws Exception {
+ final String[] extensions = new String[] {"ppt", "pptx", "doc",
+ "docx", "xls", "xlsx", "pdf", "rtf"};
+ for(String extension : extensions) {
+ verifyComment(extension, "testComment");
+ }
+ }
+
+ private void verifyComment(String extension, String fileName) throws Exception {
+ TemporaryResources tmp = new TemporaryResources();
+
+ String content = null;
+ Path tmpFile = null;
+ try {
+ tmpFile = getTestDocumentAsTempFile(fileName + "." + extension);
+ content = tika.parseToString(tmpFile);
+ } finally {
+ if (tmpFile != null) {
+ Files.delete(tmpFile);
+ }
+ }
+ assertTrue(extension + ": content=" + content + " did not extract text",
+ content.contains("Here is some text"));
+ assertTrue(extension + ": content=" + content + " did not extract comment",
+ content.contains("Here is a comment"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
new file mode 100644
index 0000000..54c1427
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.fork;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.NotSerializableException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.Tika;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fork.ForkParser;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Test that the ForkParser correctly behaves when
+ * wired in to the regular Parsers and their test data
+ */
+public class ForkParserIntegrationTest {
+
+ private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works
+
+ /**
+ * Simple text parsing
+ */
+ @Test
+ public void testForkedTextParsing() throws Exception {
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+
+ try {
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testTXT.txt");
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+
+ String content = output.toString();
+ assertContains("Test d'indexation", content);
+ assertContains("http://www.apache.org", content);
+ } finally {
+ parser.close();
+ }
+ }
+
+ /**
+ * This error has a message and an equals() implementation as to be able
+ * to match it against the serialized version of itself.
+ */
+ static class AnError extends Error {
+ private static final long serialVersionUID = -6197267350768803348L;
+ private String message;
+ AnError(String message) {
+ super(message);
+ this.message = message;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ AnError anError = (AnError) o;
+
+ if (!message.equals(anError.message)) return false;
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ return message.hashCode();
+ }
+ }
+
+ /**
+ * This error isn't serializable on the server, so can't be sent back
+ * to the Fork Client once it has occured
+ */
+ static class WontBeSerializedError extends RuntimeException {
+ private static final long serialVersionUID = 1L;
+
+ WontBeSerializedError(String message) {
+ super(message);
+ }
+
+ private void writeObject(java.io.ObjectOutputStream out) {
+ RuntimeException e = new RuntimeException("Bang!");
+ boolean found = false;
+ for (StackTraceElement ste : e.getStackTrace()) {
+ if (ste.getClassName().equals(ForkParser.class.getName())) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ throw e;
+ }
+ }
+ }
+
+ static class BrokenParser implements Parser {
+ private static final long serialVersionUID = 995871497930817839L;
+ public Error err = new AnError("Simulated fail");
+ public RuntimeException re = null;
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return new HashSet<MediaType>(Arrays.asList(MediaType.TEXT_PLAIN));
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ if (re != null) throw re;
+ throw err;
+ }
+ }
+
+ /**
+ * TIKA-831 Parsers throwing errors should be caught and
+ * properly reported
+ */
+ @Test
+ public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
+ BrokenParser brokenParser = new BrokenParser();
+ Parser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
+ InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
+
+ // With a serializable error, we'll get that back
+ try {
+ ContentHandler output = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+ fail("Expected TikaException caused by Error");
+ } catch (TikaException e) {
+ assertEquals(brokenParser.err, e.getCause());
+ }
+
+ // With a non serializable one, we'll get something else
+ // TODO Fix this test
+ brokenParser = new BrokenParser();
+ brokenParser.re= new WontBeSerializedError("Can't Serialize");
+ parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
+// try {
+// ContentHandler output = new BodyContentHandler();
+// ParseContext context = new ParseContext();
+// parser.parse(stream, output, new Metadata(), context);
+// fail("Expected TikaException caused by Error");
+// } catch (TikaException e) {
+// assertEquals(TikaException.class, e.getCause().getClass());
+// assertEquals("Bang!", e.getCause().getMessage());
+// }
+ }
+
+ /**
+ * If we supply a non serializable object on the ParseContext,
+ * check we get a helpful exception back
+ */
+ @Test
+ public void testParserHandlingOfNonSerializable() throws Exception {
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+
+ ParseContext context = new ParseContext();
+ context.set(Detector.class, new Detector() {
+ public MediaType detect(InputStream input, Metadata metadata) {
+ return MediaType.OCTET_STREAM;
+ }
+ });
+
+ try {
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testTXT.txt");
+ parser.parse(stream, output, new Metadata(), context);
+ fail("Should have blown up with a non serializable ParseContext");
+ } catch(TikaException e) {
+ // Check the right details
+ assertNotNull(e.getCause());
+ assertEquals(NotSerializableException.class, e.getCause().getClass());
+ assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
+ } finally {
+ parser.close();
+ }
+ }
+
+ /**
+ * TIKA-832
+ */
+ @Test
+ public void testAttachingADebuggerOnTheForkedParserShouldWork()
+ throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, tika.getParser());
+
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+ parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug",
+ "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
+ try {
+ ContentHandler body = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testTXT.txt");
+ parser.parse(stream, body, new Metadata(), context);
+ String content = body.toString();
+ assertContains("Test d'indexation", content);
+ assertContains("http://www.apache.org", content);
+ } finally {
+ parser.close();
+ }
+ }
+
+ /**
+ * TIKA-808 - Ensure that parsing of our test PDFs work under
+ * the Fork Parser, to ensure that complex parsing behaves
+ */
+ @Test
+ public void testForkedPDFParsing() throws Exception {
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+ try {
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testPDF.pdf");
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+
+ String content = output.toString();
+ assertContains("Apache Tika", content);
+ assertContains("Tika - Content Analysis Toolkit", content);
+ assertContains("incubator", content);
+ assertContains("Apache Software Foundation", content);
+ } finally {
+ parser.close();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
new file mode 100644
index 0000000..52af12b
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
@@ -0,0 +1,251 @@
+package org.apache.tika.parser.mock;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Date;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Somewhat bizarrely, we can't put the test of this test resource in tika-test-resources
+ * or else it will be called by every module that uses it. Um, Yossarian!!!
+ */
+public class MockParserTest extends TikaTest {
+ private final static String M = "/test-documents/mock/";
+ private final static Parser PARSER = new AutoDetectParser();
+
+ @Override
+ public XMLResult getXML(String path, Metadata m) throws Exception {
+ //note that this is specific to MockParserTest with addition of M to the path!
+ InputStream is = getResourceAsStream(M+path);
+ try {
+ return super.getXML(is, PARSER, m);
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+
+ @Test
+ public void testExample() throws Exception {
+ Metadata m = new Metadata();
+ PrintStream out = System.out;
+ PrintStream err = System.err;
+ ByteArrayOutputStream outBos = new ByteArrayOutputStream();
+ ByteArrayOutputStream errBos = new ByteArrayOutputStream();
+ PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString());
+ PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString());
+ System.setOut(tmpOut);
+ System.setErr(tmpErr);
+ try {
+ assertThrowable("example.xml", m, IOException.class, "not another IOException");
+ assertMockParser(m);
+ } finally {
+ System.setOut(out);
+ System.setErr(err);
+ }
+ String outString = new String(outBos.toByteArray(), UTF_8);
+ assertContains("writing to System.out", outString);
+
+ String errString = new String(errBos.toByteArray(), UTF_8);
+ assertContains("writing to System.err", errString);
+
+ }
+
+ @Test
+ public void testNothingBad() throws Exception {
+ Metadata m = new Metadata();
+ String content = getXML("nothing_bad.xml", m).xml;
+ assertEquals("Geoffrey Chaucer", m.get("author"));
+ assertContains("<p>And bathed every veyne in swich licour,</p>", content);
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testNullPointer() throws Exception {
+ Metadata m = new Metadata();
+ assertThrowable("null_pointer.xml", m, NullPointerException.class, "null pointer message");
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testNullPointerNoMsg() throws Exception {
+ Metadata m = new Metadata();
+ assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null);
+ assertMockParser(m);
+ }
+
+
+ @Test
+ public void testSleep() throws Exception {
+ long start = new Date().getTime();
+ Metadata m = new Metadata();
+ String content = getXML("sleep.xml", m).xml;
+ assertMockParser(m);
+ long elapsed = new Date().getTime()-start;
+ //should sleep for at least 3000
+ boolean enoughTimeHasElapsed = elapsed > 2000;
+ assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed);
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testHeavyHang() throws Exception {
+ long start = new Date().getTime();
+ Metadata m = new Metadata();
+
+ String content = getXML("heavy_hang.xml", m).xml;
+ assertMockParser(m);
+ long elapsed = new Date().getTime()-start;
+ //should sleep for at least 3000
+ boolean enoughTimeHasElapsed = elapsed > 2000;
+ assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed);
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testFakeOOM() throws Exception {
+ Metadata m = new Metadata();
+ assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom");
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testRealOOM() throws Exception {
+ //Note: we're not actually testing the diff between fake and real oom
+ //i.e. by creating child process and setting different -Xmx or
+ //memory profiling.
+ Metadata m = new Metadata();
+ assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space");
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testInterruptibleSleep() {
+ //Without static initialization of the parser, it can take ~1 second after t.start()
+ //before the parser actually calls parse. This is
+ //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc.
+ //This is not thread creation overhead.
+ ParserRunnable r = new ParserRunnable("sleep_interruptible.xml");
+ Thread t = new Thread(r);
+ t.start();
+ long start = new Date().getTime();
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+
+ t.interrupt();
+
+ try {
+ t.join(10000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+ long elapsed = new Date().getTime()-start;
+ boolean shortEnough = elapsed < 2000;//the xml file specifies 3000
+ assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough);
+ }
+
+ @Test
+ public void testNonInterruptibleSleep() {
+ ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml");
+ Thread t = new Thread(r);
+ t.start();
+ long start = new Date().getTime();
+ try {
+ //make sure that the thread has actually started
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+ t.interrupt();
+ try {
+ t.join(20000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+ long elapsed = new Date().getTime()-start;
+ boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000
+ assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough);
+ }
+
+ private class ParserRunnable implements Runnable {
+ private final String path;
+ ParserRunnable(String path) {
+ this.path = path;
+ }
+ @Override
+ public void run() {
+ Metadata m = new Metadata();
+ try {
+ getXML(path, m);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ } finally {
+ assertMockParser(m);
+ }
+ }
+ }
+
+ private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) {
+
+ try {
+ getXML(path, m);
+ } catch (Throwable t) {
+ //if this is a throwable wrapped in a TikaException, use the cause
+ if (t instanceof TikaException && t.getCause() != null) {
+ t = t.getCause();
+ }
+ if (! (t.getClass().isAssignableFrom(expected))){
+ fail(t.getClass() +" is not assignable from "+expected);
+ }
+ if (message != null) {
+ assertEquals(message, t.getMessage());
+ }
+ }
+ }
+
+ private void assertMockParser(Metadata m) {
+ String[] parsers = m.getValues("X-Parsed-By");
+ //make sure that it was actually parsed by mock.
+ boolean parsedByMock = false;
+ for (String parser : parsers) {
+ if (parser.equals("org.apache.tika.parser.mock.MockParser")) {
+ parsedByMock = true;
+ break;
+ }
+ }
+ assertTrue("mock parser should have been called", parsedByMock);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
new file mode 100644
index 0000000..c47a348
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
@@ -0,0 +1,335 @@
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PackageTest extends TikaTest {
+
+    private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
+
+    //context configured so embedded documents are parsed recursively
+    private ParseContext recursingContext;
+    private Parser autoDetectParser;
+
+    /**
+     * Creates a fresh auto-detect parser and recursing parse context
+     * before each test.
+     */
+    @Before
+    public void setUp() throws Exception {
+
+        autoDetectParser = new AutoDetectParser();
+        recursingContext = new ParseContext();
+        recursingContext.set(Parser.class, autoDetectParser);
+    }
+
+    @Test
+    public void testZlibParsing() throws Exception {
+        //the zlib wrapper should be auto-detected
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        try (InputStream stream = PackageTest.class.getResourceAsStream(
+                "/test-documents/testTXT.zlib")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
+        //the embedded text document should have been extracted
+        String content = handler.toString();
+        assertContains("Test d'indexation de Txt", content);
+        assertContains("http://www.apache.org", content);
+    }
+
+
+ @Test
+ public void testArParsing() throws Exception {
+ Parser parser = new AutoDetectParser();
+
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/testARofText.ar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-archive",
+ metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("http://www.apache.org", content);
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/testARofSND.ar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-archive",
+ metadata.get(Metadata.CONTENT_TYPE));
+ content = handler.toString();
+ assertContains("testAU.au", content);
+ }
+
+ @Test
+ public void testBzip2Parsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tbz2")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testCompressParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar.Z");
+ try {
+ parser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("application/x-compress", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testGzipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tgz")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testRarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.rar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void test7ZParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ // Ensure 7zip is a parsable format
+ assertTrue("No 7zip parser found",
+ parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
+
+ // Parse
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.7z")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+ @Test
+ public void testTarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testZipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+    @Test
+    public void testSvgzParsing() throws Exception {
+        //gzipped svg should be auto-detected as gzip
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        try (InputStream stream = PackageTest.class.getResourceAsStream(
+                "/test-documents/testSVG.svgz")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("Test SVG image", handler.toString());
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java b/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
new file mode 100644
index 0000000..eff076b
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.sax;
+
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Test class for the {@link PhoneExtractingContentHandler}
+ * class. This demonstrates how to parse a document and retrieve any phone numbers
+ * found within.
+ *
+ * The phone numbers are added to a multivalued Metadata object under the key, "phonenumbers".
+ * You can get an array of phone numbers by calling metadata.getValues("phonenumbers").
+ */
+public class PhoneExtractingContentHandlerTest {
+    @Test
+    public void testExtractPhoneNumbers() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // The PhoneExtractingContentHandler will examine any characters for phone numbers
+        // before passing them to the underlying Handler.
+        PhoneExtractingContentHandler handler =
+                new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
+        try (InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream(
+                "/test-documents/testPhoneNumberExtractor.odt")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+        String[] phoneNumbers = metadata.getValues("phonenumbers");
+        String[] expected = {
+                "9498888888", "9497777777", "9496666666", "9495555555",
+                "4193404645", "9044687081", "2604094811"
+        };
+        //fix: fail with a clear message instead of ArrayIndexOutOfBoundsException
+        //when fewer phone numbers were extracted than expected
+        if (phoneNumbers.length < expected.length) {
+            throw new AssertionError("expected at least " + expected.length
+                    + " phone numbers but found " + phoneNumbers.length);
+        }
+        for (int i = 0; i < expected.length; i++) {
+            assertContains(expected[i], phoneNumbers[i]);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java b/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
new file mode 100644
index 0000000..62660c8
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.utils;
+
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+public class ServiceLoaderUtilsTest extends TikaTest {
+    @Test
+    public void testOrdering() throws Exception {
+        //make sure that non Tika parsers come last
+        //which means that they'll overwrite Tika parsers and
+        //be preferred.
+        DefaultParser defaultParser = new DefaultParser();
+        int vorbisIndex = -1;
+        int fictIndex = -1;
+        int dcxmlIndex = -1;
+        int i = 0;
+        for (Parser p : defaultParser.getAllComponentParsers()) {
+            //idiom fix: compare fully-qualified names via getName() instead of
+            //matching toString() output with its "class " prefix
+            String name = p.getClass().getName();
+            if ("org.gagravarr.tika.VorbisParser".equals(name)) {
+                vorbisIndex = i;
+            } else if ("org.apache.tika.parser.xml.FictionBookParser".equals(name)) {
+                fictIndex = i;
+            } else if ("org.apache.tika.parser.xml.DcXMLParser".equals(name)) {
+                dcxmlIndex = i;
+            }
+            i++;
+        }
+
+        //all three parsers must be present and ordered:
+        //DcXML (core Tika) < FictionBook < Vorbis (external)
+        assertNotEquals(vorbisIndex, fictIndex);
+        assertNotEquals(fictIndex, dcxmlIndex);
+        assertTrue(vorbisIndex > fictIndex);
+        assertTrue(fictIndex > dcxmlIndex);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/pom.xml
----------------------------------------------------------------------
diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index e63f101..2c61616 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -33,8 +33,17 @@
<packaging>bundle</packaging>
<name>Apache Tika core</name>
<url>http://tika.apache.org/</url>
+ <properties>
+ <!-- NOTE: sync codec version with POI -->
+ <codec.version>1.10</codec.version>
+ </properties>
<dependencies>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
<!-- Optional OSGi dependencies, used only when running within OSGi -->
<dependency>
<groupId>org.osgi</groupId>
@@ -60,6 +69,13 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>org.ops4j.pax.exam</groupId>
<artifactId>pax-exam-junit4</artifactId>
@@ -108,6 +124,9 @@
<Bundle-DocURL>${project.url}</Bundle-DocURL>
<Bundle-Activator>org.apache.tika.config.TikaActivator</Bundle-Activator>
<Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
+ <Embed-Dependency>
+ commons-codec
+ </Embed-Dependency>
</instructions>
</configuration>
</plugin>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
new file mode 100644
index 0000000..e7b2405
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
@@ -0,0 +1,295 @@
+package org.apache.tika.parser.digesting;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
+ * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
+ * <p>
+ * This digester tries to use the regular mark/reset protocol on the InputStream.
+ * However, this wraps an internal BoundedInputStream, and if the InputStream
+ * is not fully read, then this will reset the stream and
+ * spool the InputStream to disk (via TikaInputStream) and then digest the file.
+ * <p>
+ * If a TikaInputStream is passed in and it has an underlying file that is longer
+ * than the {@link #markLimit}, then this digester digests the file directly.
+ *
+ */
+public class CommonsDigester implements DigestingParser.Digester {
+
+    /**
+     * Digest algorithms supported by this digester.
+     */
+    public enum DigestAlgorithm {
+        //those currently available in commons.digest
+        MD2,
+        MD5,
+        SHA1,
+        SHA256,
+        SHA384,
+        SHA512;
+
+        /**
+         * @return the metadata key under which this algorithm's digest is
+         *         recorded: the Tika meta prefix + "digest" + delimiter + name
+         */
+        String getMetadataKey() {
+            return TikaCoreProperties.TIKA_META_PREFIX+
+                    "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString();
+        }
+    }
+
+    //algorithms to apply to every digested stream, in order
+    private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>();
+    //max bytes to read via mark/reset before spooling to disk
+    private final int markLimit;
+
+    /**
+     * @param markLimit maximum number of bytes to digest via mark/reset
+     *                  before falling back to spooling to disk; must be &gt;= 0
+     * @param algorithms the digest algorithms to compute
+     */
+    public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
+        //fail-fast: validate the argument before mutating any state
+        if (markLimit < 0) {
+            throw new IllegalArgumentException("markLimit must be >= 0");
+        }
+        Collections.addAll(this.algorithms, algorithms);
+        this.markLimit = markLimit;
+    }
+
+    /**
+     * Digests the stream with each configured algorithm, recording the
+     * results in the metadata.
+     * <p>
+     * Tries the mark/reset protocol first; if the stream is longer than
+     * {@link #markLimit}, spools it to disk via TikaInputStream and digests
+     * the file instead.
+     */
+    @Override
+    public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
+        //cleanup: hold the typed stream once instead of repeated unchecked casts
+        TikaInputStream tis = TikaInputStream.get(is);
+        long sz = -1;
+        if (tis.hasFile()) {
+            sz = tis.getLength();
+        }
+        //if the stream is definitely backed by a file,
+        //and its size is greater than the mark limit,
+        //just digest the underlying file directly.
+        if (sz > markLimit) {
+            digestFile(tis.getFile(), m);
+            return;
+        }
+
+        //try the usual mark/reset stuff.
+        //however, if you actually hit the bound,
+        //then stop and spool to file via TikaInputStream
+        SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
+        boolean finishedStream = false;
+        for (DigestAlgorithm algorithm : algorithms) {
+            bis.mark(markLimit + 1);
+            finishedStream = digestEach(algorithm, bis, m);
+            bis.reset();
+            if (!finishedStream) {
+                break;
+            }
+        }
+        if (!finishedStream) {
+            //getFile() spools the full content to disk so the
+            //digests cover the entire stream
+            digestFile(tis.getFile(), m);
+        }
+    }
+
+    /**
+     * Digests the given file once per configured algorithm, re-opening the
+     * file for each algorithm and recording the results in the metadata.
+     */
+    private void digestFile(File f, Metadata m) throws IOException {
+        for (DigestAlgorithm algorithm : algorithms) {
+            try (InputStream is = new FileInputStream(f)) {
+                digestEach(algorithm, is, m);
+            }
+        }
+    }
+
+    /**
+     * Digests the stream with a single algorithm and, if the digest succeeded
+     * and the full stream was read, records the hex digest in the metadata.
+     *
+     * @param algorithm algo to use
+     * @param is input stream to read from
+     * @param metadata metadata for reporting the digest
+     * @return whether or not this finished the input stream
+     * @throws IOException
+     */
+    private boolean digestEach(DigestAlgorithm algorithm,
+                               InputStream is, Metadata metadata) throws IOException {
+        String digest = null;
+        try {
+            switch (algorithm) {
+                case MD2:
+                    digest = DigestUtils.md2Hex(is);
+                    break;
+                case MD5:
+                    digest = DigestUtils.md5Hex(is);
+                    break;
+                case SHA1:
+                    digest = DigestUtils.sha1Hex(is);
+                    break;
+                case SHA256:
+                    digest = DigestUtils.sha256Hex(is);
+                    break;
+                case SHA384:
+                    digest = DigestUtils.sha384Hex(is);
+                    break;
+                case SHA512:
+                    digest = DigestUtils.sha512Hex(is);
+                    break;
+                default:
+                    throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString());
+            }
+        } catch (IOException e) {
+            //NOTE(review): deliberately best-effort -- a read failure leaves
+            //this algorithm's digest unset rather than failing the parse.
+            //Removed the printStackTrace() anti-pattern; consider rethrowing.
+        }
+        if (is instanceof SimpleBoundedInputStream
+                && ((SimpleBoundedInputStream) is).hasHitBound()) {
+            //only a prefix of the stream was read; the digest would be wrong
+            return false;
+        }
+        //fix: don't record a null digest when the read failed
+        if (digest != null) {
+            metadata.set(algorithm.getMetadataKey(), digest);
+        }
+        return true;
+    }
+
+    /**
+     * Parses a comma-delimited (no space) list of algorithm names,
+     * e.g. "md5,sha256", into DigestAlgorithm values.
+     *
+     * @param s comma-delimited (no space) list of algorithms to use: md5,sha256
+     * @return the parsed algorithms
+     * @throws IllegalArgumentException if s is null or contains an unknown name
+     */
+    public static DigestAlgorithm[] parse(String s) {
+        //fix: assert is disabled by default at runtime; validate explicitly
+        if (s == null) {
+            throw new IllegalArgumentException("algorithm string must not be null");
+        }
+
+        List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>();
+        for (String algoString : s.split(",")) {
+            String uc = algoString.toUpperCase(Locale.ROOT);
+            try {
+                //idiom: valueOf replaces the hand-rolled if/else chain
+                ret.add(DigestAlgorithm.valueOf(uc));
+            } catch (IllegalArgumentException e) {
+                StringBuilder sb = new StringBuilder();
+                for (DigestAlgorithm algo : DigestAlgorithm.values()) {
+                    if (sb.length() > 0) {
+                        sb.append(", ");
+                    }
+                    sb.append(algo.toString());
+                }
+                //fix: report the offending token, not the whole input string
+                throw new IllegalArgumentException(
+                        "Couldn't match " + algoString + " with any of: " + sb.toString());
+            }
+        }
+        return ret.toArray(new DigestAlgorithm[ret.size()]);
+    }
+
+    /**
+     * Very slight modification of Commons' BoundedInputStream
+     * so that we can figure out if this hit the bound or not.
+     * <p>
+     * Declared static so no reference to the enclosing digester is retained.
+     */
+    private static class SimpleBoundedInputStream extends InputStream {
+        private final static int EOF = -1;
+        private final long max;      //max bytes to serve; < 0 means unbounded
+        private final InputStream in;
+        private long pos;            //bytes served so far
+        private boolean hitBound = false;
+
+        private SimpleBoundedInputStream(long max, InputStream in) {
+            this.max = max;
+            this.in = in;
+        }
+
+        @Override
+        public int read() throws IOException {
+            if (max >= 0 && pos >= max) {
+                hitBound = true;
+                return EOF;
+            }
+            final int result = in.read();
+            //fix: don't advance pos when the delegate reports EOF
+            if (result != EOF) {
+                pos++;
+            }
+            return result;
+        }
+
+        /**
+         * Invokes the delegate's <code>read(byte[])</code> method.
+         * @param b the buffer to read the bytes into
+         * @return the number of bytes read or -1 if the end of stream or
+         * the limit has been reached.
+         * @throws IOException if an I/O error occurs
+         */
+        @Override
+        public int read(final byte[] b) throws IOException {
+            return this.read(b, 0, b.length);
+        }
+
+        /**
+         * Invokes the delegate's <code>read(byte[], int, int)</code> method.
+         * @param b the buffer to read the bytes into
+         * @param off The start offset
+         * @param len The number of bytes to read
+         * @return the number of bytes read or -1 if the end of stream or
+         * the limit has been reached.
+         * @throws IOException if an I/O error occurs
+         */
+        @Override
+        public int read(final byte[] b, final int off, final int len) throws IOException {
+            if (max >= 0 && pos >= max) {
+                //fix: bulk reads must also record that the bound was hit --
+                //DigestUtils reads via this method, and without this flag a
+                //truncated stream's digest was recorded as if it were complete
+                hitBound = true;
+                return EOF;
+            }
+            final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
+            final int bytesRead = in.read(b, off, (int) maxRead);
+
+            if (bytesRead == EOF) {
+                return EOF;
+            }
+
+            pos += bytesRead;
+            return bytesRead;
+        }
+
+        /**
+         * Invokes the delegate's <code>skip(long)</code> method.
+         * @param n the number of bytes to skip
+         * @return the actual number of bytes skipped
+         * @throws IOException if an I/O error occurs
+         */
+        @Override
+        public long skip(final long n) throws IOException {
+            final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
+            final long skippedBytes = in.skip(toSkip);
+            pos += skippedBytes;
+            return skippedBytes;
+        }
+
+        @Override
+        public void reset() throws IOException {
+            in.reset();
+        }
+
+        @Override
+        public void mark(int readLimit) {
+            in.mark(readLimit);
+        }
+
+        /**
+         * @return true if a read was attempted at or past the bound
+         */
+        public boolean hasHitBound() {
+            return hitBound;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 2c6f21f..1edf91c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -26,6 +26,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
@@ -74,6 +77,25 @@ public abstract class TikaTest {
}
}
+
+    /**
+     * Copies test file from "test-documents" to a temp file.
+     * Consumers are responsible for deleting the temp file after use.
+     *
+     * @param name simple name of the test document
+     * @return path to the temporary copy
+     * @throws IOException if the resource cannot be read or copied
+     */
+    public Path getTestDocumentAsTempFile(String name) throws IOException {
+        Path tmp = Files.createTempFile("tika-test", "");
+        //fix: close the resource stream; the original leaked it
+        try (InputStream is = getResourceAsStream("/test-documents/" + name)) {
+            Files.copy(is, tmp, StandardCopyOption.REPLACE_EXISTING);
+        }
+        return tmp;
+    }
+
+    /**
+     * Opens the named document under "test-documents" as a
+     * TikaInputStream-wrapped resource stream.
+     */
+    public InputStream getTestDocumentAsStream(String name) {
+        return TikaInputStream.get(getResourceAsStream("/test-documents/"+name));
+    }
+
public InputStream getResourceAsStream(String name) {
InputStream stream = this.getClass().getResourceAsStream(name);
if (stream == null) {
@@ -106,36 +128,50 @@ public abstract class TikaTest {
}
}
+    /**
+     * Parses the named test document with the given parser, metadata and
+     * context, returning the XML output plus metadata.
+     */
+    protected XMLResult getXML(String filePath, Parser parser, Metadata metadata, ParseContext context) throws Exception {
+        return getXML(getTestDocumentAsStream(filePath), parser, metadata, context);
+    }
+
protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata);
+ return getXML(getTestDocumentAsStream(filePath), parser, metadata);
}
protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata);
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+
+ return getXML(getTestDocumentAsStream(filePath), parser, metadata, context);
+ }
+
+ protected XMLResult getXML(String filePath, Parser parser) throws Exception {
+ //send in empty parse context so that only outer parser is used
+ return getXML(getTestDocumentAsStream(filePath), parser, new Metadata(), new ParseContext());
}
protected XMLResult getXML(String filePath) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
+ return getXML(filePath, new Metadata());
}
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
-
- try {
- ContentHandler handler = new ToXMLContentHandler();
- parser.parse(input, handler, metadata, context);
- return new XMLResult(handler.toString(), metadata);
- } finally {
- input.close();
- }
- }
+ return getXML(input, parser, metadata, new ParseContext());
+ }
- /**
- * Basic text extraction.
- * <p>
- * Tries to close input stream after processing.
- */
+ protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
+ try {
+ ContentHandler handler = new ToXMLContentHandler();
+ parser.parse(input, handler, metadata, context);
+ return new XMLResult(handler.toString(), metadata);
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Basic text extraction.
+ * <p>
+ * Tries to close input stream after processing.
+ */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
ContentHandler handler = new BodyContentHandler(1000000);
try {
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
index c815607..d2f3b40 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
@@ -22,13 +22,13 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeDetectionTest;
import org.junit.Before;
import org.junit.Test;
-public class MimeDetectionWithNNTest {
+public class MimeDetectionWithNNTest extends TikaTest {
private Detector detector;
@@ -88,13 +88,13 @@ public class MimeDetectionWithNNTest {
private void testUrl(String expected, String url, String file)
throws IOException {
- InputStream in = MimeDetectionTest.class.getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = MimeDetectionTest.class.getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 1f986da..31df3ec 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -27,12 +27,13 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
-public class MimeDetectionTest {
+public class MimeDetectionTest extends TikaTest {
private MimeTypes mimeTypes;
@@ -136,12 +137,12 @@ public class MimeDetectionTest {
}
private void testUrl(String expected, String url, String file) throws IOException{
- InputStream in = getClass().getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
index 35c75b7..415961f 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
@@ -27,11 +27,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
-public class ProbabilisticMimeDetectionTest {
+public class ProbabilisticMimeDetectionTest extends TikaTest {
private ProbabilisticMimeDetectionSelector proDetector;
@@ -130,12 +131,12 @@ public class ProbabilisticMimeDetectionTest {
private void testUrl(String expected, String url, String file)
throws IOException {
- InputStream in = getClass().getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
index 5605300..a6dc7f3 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.net.URL;
import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.DefaultProbDetector;
import org.apache.tika.metadata.Metadata;
@@ -36,7 +37,7 @@ import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder;
import org.junit.Before;
import org.junit.Test;
-public class ProbabilisticMimeDetectionTestWithTika {
+public class ProbabilisticMimeDetectionTestWithTika extends TikaTest {
private ProbabilisticMimeDetectionSelector proSelector;
private MediaTypeRegistry registry;
@@ -151,12 +152,12 @@ public class ProbabilisticMimeDetectionTestWithTika {
private void testUrl(String expected, String url, String file)
throws IOException {
- InputStream in = getClass().getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java b/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
index f3397d9..696d5e6 100644
--- a/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
+++ b/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
@@ -18,27 +18,17 @@ package org.apache.tika.osgi;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
import static org.ops4j.pax.exam.CoreOptions.bundle;
import static org.ops4j.pax.exam.CoreOptions.junitBundles;
import static org.ops4j.pax.exam.CoreOptions.options;
-import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
import javax.inject.Inject;
-
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.io.Writer;
import java.net.URISyntaxException;
import java.util.Set;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.osgi.TikaService;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.ops4j.pax.exam.Configuration;
@@ -48,7 +38,6 @@ import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
import org.ops4j.pax.exam.spi.reactors.PerMethod;
import org.osgi.framework.Bundle;
import org.osgi.framework.BundleContext;
-import org.xml.sax.ContentHandler;
@RunWith(PaxExam.class)
@ExamReactorStrategy(PerMethod.class)
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb b/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb
deleted file mode 100644
index 0bffdca..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb and /dev/null differ