You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/02/06 04:51:09 UTC
tika git commit: TIKA-1851: move all test resources back to src/test
from src/main in tika-test-resources. Sorry!
Repository: tika
Updated Branches:
refs/heads/2.x 73d720a83 -> 249105aa3
TIKA-1851: move all test resources back to src/test from src/main in tika-test-resources. Sorry!
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/249105aa
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/249105aa
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/249105aa
Branch: refs/heads/2.x
Commit: 249105aa397f962fae8b0ac1980ae7b20ea82b25
Parents: 73d720a
Author: tballison <ta...@mitre.org>
Authored: Fri Feb 5 22:50:56 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Fri Feb 5 22:50:56 2016 -0500
----------------------------------------------------------------------
tika-batch/pom.xml | 4 +-
tika-parser-modules/pom.xml | 3 +-
tika-parsers/pom.xml | 4 +-
.../tika/config/TikaDetectorConfigTest.java | 1 -
.../apache/tika/parser/mock/MockParserTest.java | 251 +++++++++++++
tika-server/pom.xml | 4 +-
tika-test-resources/pom.xml | 2 +-
.../src/main/java/org/apache/tika/TikaTest.java | 214 -----------
.../tika/config/AbstractTikaConfigTest.java | 50 ---
.../org/apache/tika/parser/mock/MockParser.java | 365 -------------------
.../services/org.apache.tika.parser.Parser | 1 -
.../src/test/java/org/apache/tika/TikaTest.java | 214 +++++++++++
.../tika/config/AbstractTikaConfigTest.java | 50 +++
.../org/apache/tika/parser/mock/MockParser.java | 365 +++++++++++++++++++
.../apache/tika/parser/mock/MockParserTest.java | 247 -------------
.../services/org.apache.tika.parser.Parser | 1 +
16 files changed, 893 insertions(+), 883 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-batch/pom.xml
----------------------------------------------------------------------
diff --git a/tika-batch/pom.xml b/tika-batch/pom.xml
index ffd29b1..bd78cbf 100644
--- a/tika-batch/pom.xml
+++ b/tika-batch/pom.xml
@@ -81,11 +81,13 @@
<scope>test</scope>
</dependency>
<dependency>
- <groupId>${project.groupId}</groupId>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-test-resources</artifactId>
<version>${project.version}</version>
+ <type>test-jar</type>
<scope>test</scope>
</dependency>
+
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index 724f0f9..8e71c1b 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -61,9 +61,10 @@
<dependencies>
<!-- Test dependencies -->
<dependency>
- <groupId>${project.groupId}</groupId>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-test-resources</artifactId>
<version>${project.version}</version>
+ <type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 76a78ac..396902a 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -154,11 +154,13 @@
<scope>test</scope>
</dependency>
<dependency>
- <groupId>${project.groupId}</groupId>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-test-resources</artifactId>
<version>${project.version}</version>
+ <type>test-jar</type>
<scope>test</scope>
</dependency>
+
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
index 949107c..2125888 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
@@ -30,7 +30,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.mbox.OutlookPSTParser;
import org.apache.tika.parser.microsoft.POIFSContainerDetector;
import org.apache.tika.parser.pkg.ZipContainerDetector;
-import org.junit.Ignore;
import org.junit.Test;
/**
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
new file mode 100644
index 0000000..d222e68
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
@@ -0,0 +1,251 @@
+package org.apache.tika.parser.mock;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Date;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Tests for {@link MockParser}, driven by the mock documents under
+ * <code>/test-documents/mock/</code>.
+ * <p>
+ * Somewhat bizarrely, we can't put the test of this test resource in
+ * tika-test-resources, or else it will be called by every module that uses it.
+ * Um, Yossarian!!!
+ */
+public class MockParserTest extends TikaTest {
+    /** Classpath directory of the mock documents; prepended to every path by {@link #getXML(String, Metadata)}. */
+    private static final String M = "/test-documents/mock/";
+    /** Shared parser, created once so that parser start-up cost does not distort the timing tests. */
+    private static final Parser PARSER = new AutoDetectParser();
+
+    /**
+     * Parses a mock document with the shared {@link #PARSER}.
+     *
+     * @param path file name relative to {@link #M} (not to /test-documents/ as in the parent class)
+     * @param m metadata object that the parse populates
+     * @return the XHTML output together with the metadata
+     * @throws Exception whatever the parser throws
+     */
+    @Override
+    public XMLResult getXML(String path, Metadata m) throws Exception {
+        //note that this is specific to MockParserTest with addition of M to the path!
+        InputStream is = getResourceAsStream(M + path);
+        try {
+            return super.getXML(is, PARSER, m);
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+    }
+
+    /**
+     * example.xml is expected to write to System.out and System.err and then throw
+     * an IOException; the streams are redirected so that the output can be checked.
+     */
+    @Test
+    public void testExample() throws Exception {
+        Metadata m = new Metadata();
+        PrintStream out = System.out;
+        PrintStream err = System.err;
+        ByteArrayOutputStream outBos = new ByteArrayOutputStream();
+        ByteArrayOutputStream errBos = new ByteArrayOutputStream();
+        PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString());
+        PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString());
+        System.setOut(tmpOut);
+        System.setErr(tmpErr);
+        try {
+            assertThrowable("example.xml", m, IOException.class, "not another IOException");
+            assertMockParser(m);
+        } finally {
+            //always restore the real streams, even if an assertion fails
+            System.setOut(out);
+            System.setErr(err);
+        }
+        String outString = new String(outBos.toByteArray(), UTF_8);
+        assertContains("writing to System.out", outString);
+
+        String errString = new String(errBos.toByteArray(), UTF_8);
+        assertContains("writing to System.err", errString);
+    }
+
+    @Test
+    public void testNothingBad() throws Exception {
+        Metadata m = new Metadata();
+        String content = getXML("nothing_bad.xml", m).xml;
+        assertEquals("Geoffrey Chaucer", m.get("author"));
+        assertContains("<p>And bathed every veyne in swich licour,</p>", content);
+        assertMockParser(m);
+    }
+
+    @Test
+    public void testNullPointer() throws Exception {
+        Metadata m = new Metadata();
+        assertThrowable("null_pointer.xml", m, NullPointerException.class, "another null pointer exception");
+        assertMockParser(m);
+    }
+
+    @Test
+    public void testNullPointerNoMsg() throws Exception {
+        Metadata m = new Metadata();
+        //null message: only the exception type is checked
+        assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null);
+        assertMockParser(m);
+    }
+
+    @Test
+    public void testSleep() throws Exception {
+        long start = new Date().getTime();
+        Metadata m = new Metadata();
+        getXML("sleep.xml", m);
+        long elapsed = new Date().getTime() - start;
+        //should sleep for at least 3000; 2000 leaves some slack
+        assertTrue("not enough time has elapsed: " + elapsed, elapsed > 2000);
+        assertMockParser(m);
+    }
+
+    @Test
+    public void testHeavyHang() throws Exception {
+        long start = new Date().getTime();
+        Metadata m = new Metadata();
+        getXML("heavy_hang.xml", m);
+        long elapsed = new Date().getTime() - start;
+        //should hang for at least 3000; 2000 leaves some slack
+        assertTrue("not enough time has elapsed: " + elapsed, elapsed > 2000);
+        assertMockParser(m);
+    }
+
+    @Test
+    public void testFakeOOM() throws Exception {
+        Metadata m = new Metadata();
+        assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom");
+        assertMockParser(m);
+    }
+
+    @Test
+    public void testRealOOM() throws Exception {
+        //Note: we're not actually testing the diff between fake and real oom
+        //i.e. by creating child process and setting different -Xmx or
+        //memory profiling.
+        Metadata m = new Metadata();
+        assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space");
+        assertMockParser(m);
+    }
+
+    /**
+     * Interrupts a parse of an interruptible sleep after one second and checks
+     * that the parse ends well before the sleep specified in the document.
+     */
+    @Test
+    public void testInterruptibleSleep() {
+        //Without static initialization of the parser, it can take ~1 second after t.start()
+        //before the parser actually calls parse. This is
+        //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc.
+        //This is not thread creation overhead.
+        ParserRunnable r = new ParserRunnable("sleep_interruptible.xml");
+        Thread t = new Thread(r);
+        t.start();
+        long start = new Date().getTime();
+        try {
+            Thread.sleep(1000);
+        } catch (InterruptedException e) {
+            //swallow
+        }
+
+        t.interrupt();
+
+        try {
+            t.join(10000);
+        } catch (InterruptedException e) {
+            //swallow
+        }
+        long elapsed = new Date().getTime() - start;
+        boolean shortEnough = elapsed < 2000;//the xml file specifies 3000
+        assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough);
+    }
+
+    /**
+     * Interrupts a parse of a non-interruptible sleep after one second and checks
+     * that the parse nevertheless runs for the full sleep specified in the document.
+     */
+    @Test
+    public void testNonInterruptibleSleep() {
+        ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml");
+        Thread t = new Thread(r);
+        t.start();
+        long start = new Date().getTime();
+        try {
+            //make sure that the thread has actually started
+            Thread.sleep(1000);
+        } catch (InterruptedException e) {
+            //swallow
+        }
+        t.interrupt();
+        try {
+            t.join(20000);
+        } catch (InterruptedException e) {
+            //swallow
+        }
+        long elapsed = new Date().getTime() - start;
+        boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000
+        assertTrue("elapsed (" + elapsed + " millis) was not long enough", longEnough);
+    }
+
+    /**
+     * Runs a parse of one mock document so that a test can interrupt it from
+     * another thread.
+     * <p>
+     * NOTE(review): anything thrown from {@link #run()} is raised on the worker
+     * thread, so it does not by itself fail the test that started the thread.
+     */
+    private class ParserRunnable implements Runnable {
+        private final String path;
+
+        ParserRunnable(String path) {
+            this.path = path;
+        }
+
+        @Override
+        public void run() {
+            Metadata m = new Metadata();
+            try {
+                getXML(path, m);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            } finally {
+                assertMockParser(m);
+            }
+        }
+    }
+
+    /**
+     * Parses the given mock document and fails unless the parse throws the expected throwable.
+     *
+     * @param path file name relative to {@link #M}
+     * @param m metadata object that the parse populates
+     * @param expected type that the thrown throwable (or, for a TikaException
+     *                 with a cause, its cause) must be an instance of
+     * @param message exact message the throwable must carry, or null to skip the message check
+     */
+    private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) {
+        Throwable thrown = null;
+        try {
+            getXML(path, m);
+        } catch (Throwable t) {
+            thrown = t;
+        }
+        //the checks live outside the catch block so that a parse which throws nothing is a failure
+        if (thrown == null) {
+            fail("expected " + expected.getName() + " but nothing was thrown for " + path);
+        }
+        //if this is a throwable wrapped in a TikaException, use the cause
+        if (thrown instanceof TikaException && thrown.getCause() != null) {
+            thrown = thrown.getCause();
+        }
+        //the expected type must be the same as, or a supertype of, what was thrown
+        if (!expected.isAssignableFrom(thrown.getClass())) {
+            fail("expected " + expected.getName() + " but got " + thrown);
+        }
+        if (message != null) {
+            assertEquals(message, thrown.getMessage());
+        }
+    }
+
+    /**
+     * Fails unless the X-Parsed-By metadata values include the MockParser class name.
+     *
+     * @param m metadata populated by a parse
+     */
+    private void assertMockParser(Metadata m) {
+        String[] parsers = m.getValues("X-Parsed-By");
+        //make sure that it was actually parsed by mock.
+        boolean parsedByMock = false;
+        for (String parser : parsers) {
+            if (parser.equals("org.apache.tika.parser.mock.MockParser")) {
+                parsedByMock = true;
+                break;
+            }
+        }
+        assertTrue("mock parser should have been called", parsedByMock);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-server/pom.xml
----------------------------------------------------------------------
diff --git a/tika-server/pom.xml b/tika-server/pom.xml
index 4634068..958cd74 100644
--- a/tika-server/pom.xml
+++ b/tika-server/pom.xml
@@ -120,11 +120,13 @@
<scope>test</scope>
</dependency>
<dependency>
- <groupId>${project.groupId}</groupId>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-test-resources</artifactId>
<version>${project.version}</version>
+ <type>test-jar</type>
<scope>test</scope>
</dependency>
+
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/pom.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/pom.xml b/tika-test-resources/pom.xml
index 7574e0c..5df07f4 100644
--- a/tika-test-resources/pom.xml
+++ b/tika-test-resources/pom.xml
@@ -78,7 +78,7 @@
<executions>
<execution>
<goals>
- <goal>jar</goal>
+ <goal>test-jar</goal>
</goals>
</execution>
</executions>
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java b/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java
deleted file mode 100644
index 2c6f21f..0000000
--- a/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.tika.extractor.EmbeddedResourceHandler;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
-import org.xml.sax.ContentHandler;
-
-/**
- * Parent class of Tika tests
- */
-public abstract class TikaTest {
- /**
- * This method will give you back the filename incl. the absolute path name
- * to the resource. If the resource does not exist it will give you back the
- * resource name incl. the path.
- *
- * @param name
- * The named resource to search for.
- * @return an absolute path incl. the name which is in the same directory as
- * the the class you've called it from.
- */
- public File getResourceAsFile(String name) throws URISyntaxException {
- URL url = this.getClass().getResource(name);
- if (url != null) {
- return new File(url.toURI());
- } else {
- // We have a file which does not exists
- // We got the path
- url = this.getClass().getResource(".");
- File file = new File(new File(url.toURI()), name);
- if (file == null) {
- fail("Unable to find requested file " + name);
- }
- return file;
- }
- }
-
- public InputStream getResourceAsStream(String name) {
- InputStream stream = this.getClass().getResourceAsStream(name);
- if (stream == null) {
- fail("Unable to find requested resource " + name);
- }
- return stream;
- }
-
- public static void assertContains(String needle, String haystack) {
- assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
- }
- public static <T> void assertContains(T needle, Collection<? extends T> haystack) {
- assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
- }
-
- public static void assertNotContained(String needle, String haystack) {
- assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
- }
- public static <T> void assertNotContained(T needle, Collection<? extends T> haystack) {
- assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
- }
-
- protected static class XMLResult {
- public final String xml;
- public final Metadata metadata;
-
- public XMLResult(String xml, Metadata metadata) {
- this.xml = xml;
- this.metadata = metadata;
- }
- }
-
- protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata);
- }
-
- protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata);
- }
-
- protected XMLResult getXML(String filePath) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
- }
-
- protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
-
- try {
- ContentHandler handler = new ToXMLContentHandler();
- parser.parse(input, handler, metadata, context);
- return new XMLResult(handler.toString(), metadata);
- } finally {
- input.close();
- }
- }
-
- /**
- * Basic text extraction.
- * <p>
- * Tries to close input stream after processing.
- */
- public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
- ContentHandler handler = new BodyContentHandler(1000000);
- try {
- parser.parse(is, handler, metadata, context);
- } finally {
- is.close();
- }
- return handler.toString();
- }
-
- public String getText(InputStream is, Parser parser, Metadata metadata) throws Exception{
- return getText(is, parser, new ParseContext(), metadata);
- }
-
- public String getText(InputStream is, Parser parser, ParseContext context) throws Exception{
- return getText(is, parser, context, new Metadata());
- }
-
- public String getText(InputStream is, Parser parser) throws Exception{
- return getText(is, parser, new ParseContext(), new Metadata());
- }
-
- /**
- * Keeps track of media types and file names recursively.
- *
- */
- public static class TrackingHandler implements EmbeddedResourceHandler {
- public List<String> filenames = new ArrayList<String>();
- public List<MediaType> mediaTypes = new ArrayList<MediaType>();
-
- private final Set<MediaType> skipTypes;
-
- public TrackingHandler() {
- skipTypes = new HashSet<MediaType>();
- }
-
- public TrackingHandler(Set<MediaType> skipTypes) {
- this.skipTypes = skipTypes;
- }
-
- @Override
- public void handle(String filename, MediaType mediaType,
- InputStream stream) {
- if (skipTypes.contains(mediaType)) {
- return;
- }
- mediaTypes.add(mediaType);
- filenames.add(filename);
- }
- }
-
- /**
- * Copies byte[] of embedded documents into a List.
- */
- public static class ByteCopyingHandler implements EmbeddedResourceHandler {
-
- public List<byte[]> bytes = new ArrayList<byte[]>();
-
- @Override
- public void handle(String filename, MediaType mediaType,
- InputStream stream) {
- ByteArrayOutputStream os = new ByteArrayOutputStream();
- if (! stream.markSupported()) {
- stream = TikaInputStream.get(stream);
- }
- stream.mark(0);
- try {
- IOUtils.copy(stream, os);
- bytes.add(os.toByteArray());
- stream.reset();
- } catch (IOException e) {
- //swallow
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java b/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java
deleted file mode 100644
index 1b104f7..0000000
--- a/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.config;
-
-import static org.junit.Assert.assertNotNull;
-
-import java.net.URL;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.parser.ParseContext;
-import org.junit.After;
-
-/**
- * Parent of Junit test classes for {@link TikaConfig}, including
- * Tika Core based ones, and ones in Tika Parsers that do things
- * that tika-core's can't, do due to a need for the
- * full set of "real" classes of parsers / detectors
- */
-public abstract class AbstractTikaConfigTest extends TikaTest {
- protected static ParseContext context = new ParseContext();
-
- protected static String getConfigPath(String config) throws Exception {
- URL url = TikaConfig.class.getResource(config);
- assertNotNull("Test Tika Config not found: " + config, url);
- return url.toExternalForm();
- }
- protected static TikaConfig getConfig(String config) throws Exception {
- System.setProperty("tika.config", getConfigPath(config));
- return new TikaConfig();
- }
-
- @After
- public void resetConfig() {
- System.clearProperty("tika.config");
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java b/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java
deleted file mode 100644
index a920502..0000000
--- a/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java
+++ /dev/null
@@ -1,365 +0,0 @@
-package org.apache.tika.parser.mock;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.Constructor;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.w3c.dom.Document;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * This class enables mocking of parser behavior for use in testing
- * wrappers and drivers of parsers.
- * <p>
- * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation
- * of all the options for this MockParser.
- * <p>
- * Tests for this class are in tika-parsers.
- * <p>
- * See also {@link org.apache.tika.parser.DummyParser} for another option.
- */
-
-public class MockParser extends AbstractParser {
-
- private static final long serialVersionUID = 1L;
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- Set<MediaType> types = new HashSet<MediaType>();
- MediaType type = MediaType.application("mock+xml");
- types.add(type);
- return types;
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
- Document doc = null;
- DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
- DocumentBuilder docBuilder = null;
- try {
- docBuilder = fact.newDocumentBuilder();
- doc = docBuilder.parse(stream);
- } catch (ParserConfigurationException e) {
- throw new IOException(e);
- } catch (SAXException e) {
- throw new IOException(e);
- }
- Node root = doc.getDocumentElement();
- NodeList actions = root.getChildNodes();
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- for (int i = 0; i < actions.getLength(); i++) {
- executeAction(actions.item(i), metadata, context, xhtml);
- }
- xhtml.endDocument();
- }
-
- private void executeAction(Node action, Metadata metadata, ParseContext context,
- XHTMLContentHandler xhtml) throws SAXException,
- IOException, TikaException {
-
- if (action.getNodeType() != 1) {
- return;
- }
-
- String name = action.getNodeName();
- if ("metadata".equals(name)) {
- metadata(action, metadata);
- } else if("write".equals(name)) {
- write(action, xhtml);
- } else if ("throw".equals(name)) {
- throwIt(action);
- } else if ("hang".equals(name)) {
- hang(action);
- } else if ("oom".equals(name)) {
- kabOOM();
- } else if ("print_out".equals(name) || "print_err".equals(name)){
- print(action, name);
- } else if ("embedded".equals(name)) {
- handleEmbedded(action, xhtml, context);
- } else if ("throwIllegalChars".equals(name)) {
- throwIllegalChars();
- } else {
- throw new IllegalArgumentException("Didn't recognize mock action: "+name);
- }
- }
-
- private void throwIllegalChars() throws IOException {
- throw new IOException("Can't say \u0000 in xml or \u0001 or \u0002 or \u0003");
- }
-
- private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context)
- throws TikaException, SAXException, IOException {
- String fileName = "";
- String contentType = "";
- NamedNodeMap attrs = action.getAttributes();
- if (attrs != null) {
- Node n = attrs.getNamedItem("filename");
- if (n != null) {
- fileName = n.getNodeValue();
- }
- n = attrs.getNamedItem("content-type");
- if (n != null) {
- contentType = n.getNodeValue();
- }
- }
-
- String embeddedText = action.getTextContent();
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(context);
- Metadata m = new Metadata();
- m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
- if (! "".equals(contentType)) {
- m.set(Metadata.CONTENT_TYPE, contentType);
- }
- InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8));
-
- extractor.parseEmbedded(
- is,
- new EmbeddedContentHandler(handler),
- m, true);
-
-
- }
-
- protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
- EmbeddedDocumentExtractor extractor =
- context.get(EmbeddedDocumentExtractor.class);
- if (extractor == null) {
- Parser p = context.get(Parser.class);
- if (p == null) {
- context.set(Parser.class, new MockParser());
- }
- extractor = new ParsingEmbeddedDocumentExtractor(context);
- }
- return extractor;
- }
-
- private void print(Node action, String name) {
- String content = action.getTextContent();
- if ("print_out".equals(name)) {
- System.out.println(content);
- } else if ("print_err".equals(name)) {
- System.err.println(content);
- } else {
- throw new IllegalArgumentException("must be print_out or print_err");
- }
- }
- private void hang(Node action) {
- boolean interruptible = true;
- boolean heavy = false;
- long millis = -1;
- long pulseMillis = -1;
- NamedNodeMap attrs = action.getAttributes();
- Node iNode = attrs.getNamedItem("interruptible");
- if (iNode != null) {
- interruptible = ("true".equals(iNode.getNodeValue()));
- }
- Node hNode = attrs.getNamedItem("heavy");
- if (hNode != null) {
- heavy = ("true".equals(hNode.getNodeValue()));
- }
-
- Node mNode = attrs.getNamedItem("millis");
- if (mNode == null) {
- throw new RuntimeException("Must specify \"millis\" attribute for hang.");
- }
- String millisString = mNode.getNodeValue();
- try {
- millis = Long.parseLong(millisString);
- } catch (NumberFormatException e) {
- throw new RuntimeException("Value for \"millis\" attribute must be a long.");
- }
-
- if (heavy) {
- Node pNode = attrs.getNamedItem("pulse_millis");
- if (pNode == null) {
- throw new RuntimeException("Must specify attribute \"pulse_millis\" if the hang is \"heavy\"");
- }
- String pulseMillisString = mNode.getNodeValue();
- try {
- pulseMillis = Long.parseLong(pulseMillisString);
- } catch (NumberFormatException e) {
- throw new RuntimeException("Value for \"millis\" attribute must be a long.");
- }
- }
- if (heavy) {
- hangHeavy(millis, pulseMillis, interruptible);
- } else {
- sleep(millis, interruptible);
- }
- }
-
- private void throwIt(Node action) throws IOException,
- SAXException, TikaException {
- NamedNodeMap attrs = action.getAttributes();
- String className = attrs.getNamedItem("class").getNodeValue();
- String msg = action.getTextContent();
- throwIt(className, msg);
- }
-
- private void metadata(Node action, Metadata metadata) {
- NamedNodeMap attrs = action.getAttributes();
- //throws npe unless there is a name
- String name = attrs.getNamedItem("name").getNodeValue();
- String value = action.getTextContent();
- Node actionType = attrs.getNamedItem("action");
- if (actionType == null) {
- metadata.add(name, value);
- } else {
- if ("set".equals(actionType.getNodeValue())) {
- metadata.set(name, value);
- } else {
- metadata.add(name, value);
- }
- }
- }
-
- private void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
- NamedNodeMap attrs = action.getAttributes();
- Node eNode = attrs.getNamedItem("element");
- String elementType = "p";
- if (eNode != null) {
- elementType = eNode.getTextContent();
- }
- String text = action.getTextContent();
- xhtml.startElement(elementType);
- xhtml.characters(text);
- xhtml.endElement(elementType);
- }
-
-
- private void throwIt(String className, String msg) throws IOException,
- SAXException, TikaException {
- Throwable t = null;
- if (msg == null || msg.equals("")) {
- try {
- t = (Throwable) Class.forName(className).newInstance();
- } catch (Exception e) {
- throw new RuntimeException("couldn't create throwable class:"+className, e);
- }
- } else {
- try {
- Class<?> clazz = Class.forName(className);
- Constructor<?> con = clazz.getConstructor(String.class);
- t = (Throwable) con.newInstance(msg);
- } catch (Exception e) {
- throw new RuntimeException("couldn't create throwable class:" + className, e);
- }
- }
- if (t instanceof SAXException) {
- throw (SAXException)t;
- } else if (t instanceof IOException) {
- throw (IOException) t;
- } else if (t instanceof TikaException) {
- throw (TikaException) t;
- } else if (t instanceof Error) {
- throw (Error) t;
- } else if (t instanceof RuntimeException) {
- throw (RuntimeException) t;
- } else {
- //wrap the throwable in a RuntimeException
- throw new RuntimeException(t);
- }
- }
-
- private void kabOOM() {
- List<int[]> ints = new ArrayList<int[]>();
-
- while (true) {
- int[] intArr = new int[32000];
- ints.add(intArr);
- }
- }
-
- private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) {
- //do some heavy computation and occasionally check for
- //whether time has exceeded maxMillis (see TIKA-1132 for inspiration)
- //or whether the thread was interrupted
- long start = new Date().getTime();
- int lastChecked = 0;
- while (true) {
- for (int i = 1; i < Integer.MAX_VALUE; i++) {
- for (int j = 1; j < Integer.MAX_VALUE; j++) {
- double div = (double) i / (double) j;
- lastChecked++;
- if (lastChecked > pulseCheckMillis) {
- lastChecked = 0;
- if (interruptible && Thread.currentThread().isInterrupted()) {
- return;
- }
- long elapsed = new Date().getTime()-start;
- if (elapsed > maxMillis) {
- return;
- }
- }
- }
- }
- }
- }
-
- private void sleep(long maxMillis, boolean isInterruptible) {
- long start = new Date().getTime();
- long millisRemaining = maxMillis;
- while (true) {
- try {
- Thread.sleep(millisRemaining);
- } catch (InterruptedException e) {
- if (isInterruptible) {
- return;
- }
- }
- long elapsed = new Date().getTime()-start;
- millisRemaining = maxMillis - elapsed;
- if (millisRemaining <= 0) {
- break;
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
deleted file mode 100644
index 69bfdeb..0000000
--- a/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.tika.parser.mock.MockParser
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java b/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java
new file mode 100644
index 0000000..2c6f21f
--- /dev/null
+++ b/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java
@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Parent class of Tika tests.
+ * <p>
+ * Provides resource lookup helpers, containment assertions and small
+ * wrappers that run a {@link Parser} over a test document.
+ */
+public abstract class TikaTest {
+    /**
+     * Resolves a resource name to a {@link File}.
+     * <p>
+     * If the resource exists, the file it resolves to is returned. If it does
+     * not exist, a file with the requested name located in the directory of
+     * the calling class is returned instead (that file may not exist).
+     *
+     * @param name
+     *            The named resource to search for.
+     * @return an absolute path incl. the name which is in the same directory as
+     *         the class you've called it from.
+     * @throws URISyntaxException if a resource URL cannot be converted to a URI
+     */
+    public File getResourceAsFile(String name) throws URISyntaxException {
+        URL url = this.getClass().getResource(name);
+        if (url != null) {
+            return new File(url.toURI());
+        }
+        // The resource does not exist: fall back to the directory of this class.
+        URL base = this.getClass().getResource(".");
+        if (base == null) {
+            // "new File(...)" never yields null, so the directory lookup is the
+            // only step that can come back empty; report it with a readable
+            // message instead of running into a NullPointerException.
+            fail("Unable to find requested file " + name);
+        }
+        return new File(new File(base.toURI()), name);
+    }
+
+    /**
+     * Opens a resource relative to the calling class and fails the test if
+     * the resource cannot be found.
+     *
+     * @param name the named resource to open
+     * @return an open stream for the resource
+     */
+    public InputStream getResourceAsStream(String name) {
+        InputStream stream = this.getClass().getResourceAsStream(name);
+        if (stream == null) {
+            fail("Unable to find requested resource " + name);
+        }
+        return stream;
+    }
+
+    /**
+     * Asserts that {@code haystack} contains the substring {@code needle}.
+     */
+    public static void assertContains(String needle, String haystack) {
+        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
+    }
+
+    /**
+     * Asserts that the collection {@code haystack} contains {@code needle}.
+     */
+    public static <T> void assertContains(T needle, Collection<? extends T> haystack) {
+        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
+    }
+
+    /**
+     * Asserts that {@code haystack} does not contain the substring {@code needle}.
+     */
+    public static void assertNotContained(String needle, String haystack) {
+        assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
+    }
+
+    /**
+     * Asserts that the collection {@code haystack} does not contain {@code needle}.
+     */
+    public static <T> void assertNotContained(T needle, Collection<? extends T> haystack) {
+        assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
+    }
+
+    /**
+     * Pairs the XML produced by a parse with the metadata object that was
+     * passed to that parse.
+     */
+    protected static class XMLResult {
+        public final String xml;
+        public final Metadata metadata;
+
+        public XMLResult(String xml, Metadata metadata) {
+            this.xml = xml;
+            this.metadata = metadata;
+        }
+    }
+
+    /**
+     * Parses the resource {@code /test-documents/<filePath>} with the given
+     * parser and metadata.
+     */
+    protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
+        return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata);
+    }
+
+    /**
+     * Parses the resource {@code /test-documents/<filePath>} with a new
+     * {@link AutoDetectParser} and the given metadata.
+     */
+    protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
+        return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata);
+    }
+
+    /**
+     * Parses the resource {@code /test-documents/<filePath>} with a new
+     * {@link AutoDetectParser} and fresh metadata.
+     */
+    protected XMLResult getXML(String filePath) throws Exception {
+        return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
+    }
+
+    /**
+     * Parses {@code input} into XML. The parser is also registered in the
+     * parse context under {@code Parser.class}. The input stream is always
+     * closed, whether or not the parse succeeds.
+     *
+     * @param input    the stream to parse; closed before this method returns
+     * @param parser   the parser to run
+     * @param metadata the metadata object handed to the parser
+     * @return the XML string together with {@code metadata}
+     */
+    protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+
+        try {
+            ContentHandler handler = new ToXMLContentHandler();
+            parser.parse(input, handler, metadata, context);
+            return new XMLResult(handler.toString(), metadata);
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Basic text extraction.
+     * <p>
+     * Tries to close input stream after processing.
+     */
+    public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
+        // NOTE(review): 1000000 is presumably the handler's write limit -- confirm
+        // against BodyContentHandler.
+        ContentHandler handler = new BodyContentHandler(1000000);
+        try {
+            parser.parse(is, handler, metadata, context);
+        } finally {
+            is.close();
+        }
+        return handler.toString();
+    }
+
+    /**
+     * Text extraction with a fresh {@link ParseContext}.
+     */
+    public String getText(InputStream is, Parser parser, Metadata metadata) throws Exception{
+        return getText(is, parser, new ParseContext(), metadata);
+    }
+
+    /**
+     * Text extraction with fresh {@link Metadata}.
+     */
+    public String getText(InputStream is, Parser parser, ParseContext context) throws Exception{
+        return getText(is, parser, context, new Metadata());
+    }
+
+    /**
+     * Text extraction with a fresh {@link ParseContext} and fresh {@link Metadata}.
+     */
+    public String getText(InputStream is, Parser parser) throws Exception{
+        return getText(is, parser, new ParseContext(), new Metadata());
+    }
+
+    /**
+     * Keeps track of media types and file names recursively.
+     * <p>
+     * Resources whose media type is in the skip set are not recorded.
+     */
+    public static class TrackingHandler implements EmbeddedResourceHandler {
+        public List<String> filenames = new ArrayList<String>();
+        public List<MediaType> mediaTypes = new ArrayList<MediaType>();
+
+        private final Set<MediaType> skipTypes;
+
+        /** Creates a handler that records every resource it is handed. */
+        public TrackingHandler() {
+            skipTypes = new HashSet<MediaType>();
+        }
+
+        /**
+         * Creates a handler that ignores resources of the given types.
+         * The set is used as passed in; it is not copied.
+         */
+        public TrackingHandler(Set<MediaType> skipTypes) {
+            this.skipTypes = skipTypes;
+        }
+
+        @Override
+        public void handle(String filename, MediaType mediaType,
+                           InputStream stream) {
+            if (skipTypes.contains(mediaType)) {
+                return;
+            }
+            mediaTypes.add(mediaType);
+            filenames.add(filename);
+        }
+    }
+
+    /**
+     * Copies byte[] of embedded documents into a List.
+     */
+    public static class ByteCopyingHandler implements EmbeddedResourceHandler {
+
+        public List<byte[]> bytes = new ArrayList<byte[]>();
+
+        @Override
+        public void handle(String filename, MediaType mediaType,
+                           InputStream stream) {
+            ByteArrayOutputStream os = new ByteArrayOutputStream();
+            if (! stream.markSupported()) {
+                stream = TikaInputStream.get(stream);
+            }
+            stream.mark(0);
+            try {
+                IOUtils.copy(stream, os);
+                bytes.add(os.toByteArray());
+                stream.reset();
+            } catch (IOException e) {
+                // deliberately best-effort: a failed copy or reset is ignored
+                // and, if the copy failed, nothing is recorded for this resource
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java b/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
new file mode 100644
index 0000000..1b104f7
--- /dev/null
+++ b/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import static org.junit.Assert.assertNotNull;
+
+import java.net.URL;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.parser.ParseContext;
+import org.junit.After;
+
+/**
+ * Parent of JUnit test classes for {@link TikaConfig}. This covers the
+ * Tika Core based ones as well as the ones in Tika Parsers, which do
+ * things that tika-core's tests can't do because they need the full
+ * set of "real" parser / detector classes.
+ */
+public abstract class AbstractTikaConfigTest extends TikaTest {
+    /** Parse context shared by all tests; static, so one instance per JVM. */
+    protected static ParseContext context = new ParseContext();
+
+    /**
+     * Looks up a test configuration relative to {@link TikaConfig} and
+     * returns its location in external (URL string) form. The test fails
+     * if the configuration resource cannot be found.
+     *
+     * @param config resource name of the configuration
+     * @return the external form of the resource URL
+     */
+    protected static String getConfigPath(String config) throws Exception {
+        final URL location = TikaConfig.class.getResource(config);
+        assertNotNull("Test Tika Config not found: " + config, location);
+        return location.toExternalForm();
+    }
+
+    /**
+     * Points the {@code tika.config} system property at the named test
+     * configuration and then builds a {@link TikaConfig} with the no-arg
+     * constructor.
+     * NOTE(review): presumably that constructor reads the property -- confirm.
+     *
+     * @param config resource name of the configuration
+     * @return the newly created configuration
+     */
+    protected static TikaConfig getConfig(String config) throws Exception {
+        final String configLocation = getConfigPath(config);
+        System.setProperty("tika.config", configLocation);
+        return new TikaConfig();
+    }
+
+    /**
+     * Clears the {@code tika.config} system property after each test so a
+     * value set by {@link #getConfig(String)} does not leak into later tests.
+     */
+    @After
+    public void resetConfig() {
+        System.clearProperty("tika.config");
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java
new file mode 100644
index 0000000..a920502
--- /dev/null
+++ b/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java
@@ -0,0 +1,365 @@
+package org.apache.tika.parser.mock;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Constructor;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This class enables mocking of parser behavior for use in testing
+ * wrappers and drivers of parsers.
+ * <p>
+ * The input is an XML document; each child element of the root is an
+ * "action" (metadata, write, throw, hang, oom, print_out, print_err,
+ * embedded, throwIllegalChars) that is executed in document order.
+ * <p>
+ * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation
+ * of all the options for this MockParser.
+ * <p>
+ * Tests for this class are in tika-parsers.
+ * <p>
+ * See also {@link org.apache.tika.parser.DummyParser} for another option.
+ */
+public class MockParser extends AbstractParser {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @return a new set holding the single supported type, {@code application/mock+xml}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        Set<MediaType> types = new HashSet<MediaType>();
+        MediaType type = MediaType.application("mock+xml");
+        types.add(type);
+        return types;
+    }
+
+    /**
+     * Reads the stream as an XML document and executes every child element
+     * of the root as a mock action, between startDocument and endDocument.
+     *
+     * @throws IOException if the XML cannot be read or parsed (parser
+     *         configuration and SAX errors are wrapped), or if an action throws it
+     * @throws SAXException if an action or the content handler throws it
+     * @throws TikaException if an action throws it
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        Document doc;
+        try {
+            DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+            doc = docBuilder.parse(stream);
+        } catch (ParserConfigurationException | SAXException e) {
+            // malformed mock input is reported as an I/O problem
+            throw new IOException(e);
+        }
+        Node root = doc.getDocumentElement();
+        NodeList actions = root.getChildNodes();
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        for (int i = 0; i < actions.getLength(); i++) {
+            executeAction(actions.item(i), metadata, context, xhtml);
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * Dispatches one child node of the root to the matching action.
+     * Non-element nodes (text, comments, ...) are ignored.
+     *
+     * @throws IllegalArgumentException if the element name is not a known action
+     */
+    private void executeAction(Node action, Metadata metadata, ParseContext context,
+                               XHTMLContentHandler xhtml) throws SAXException,
+            IOException, TikaException {
+
+        if (action.getNodeType() != Node.ELEMENT_NODE) {
+            return;
+        }
+
+        String name = action.getNodeName();
+        if ("metadata".equals(name)) {
+            metadata(action, metadata);
+        } else if ("write".equals(name)) {
+            write(action, xhtml);
+        } else if ("throw".equals(name)) {
+            throwIt(action);
+        } else if ("hang".equals(name)) {
+            hang(action);
+        } else if ("oom".equals(name)) {
+            kabOOM();
+        } else if ("print_out".equals(name) || "print_err".equals(name)) {
+            print(action, name);
+        } else if ("embedded".equals(name)) {
+            handleEmbedded(action, xhtml, context);
+        } else if ("throwIllegalChars".equals(name)) {
+            throwIllegalChars();
+        } else {
+            throw new IllegalArgumentException("Didn't recognize mock action: "+name);
+        }
+    }
+
+    /**
+     * Always throws an IOException whose message contains characters that
+     * are not legal in XML.
+     */
+    private void throwIllegalChars() throws IOException {
+        throw new IOException("Can't say \u0000 in xml or \u0001 or \u0002 or \u0003");
+    }
+
+    /**
+     * Feeds the text content of the action to the embedded document
+     * extractor as a UTF-8 stream. The optional "filename" and
+     * "content-type" attributes are copied into the embedded metadata.
+     */
+    private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context)
+            throws TikaException, SAXException, IOException {
+        String fileName = "";
+        String contentType = "";
+        NamedNodeMap attrs = action.getAttributes();
+        if (attrs != null) {
+            Node n = attrs.getNamedItem("filename");
+            if (n != null) {
+                fileName = n.getNodeValue();
+            }
+            n = attrs.getNamedItem("content-type");
+            if (n != null) {
+                contentType = n.getNodeValue();
+            }
+        }
+
+        String embeddedText = action.getTextContent();
+        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(context);
+        Metadata m = new Metadata();
+        m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
+        if (! "".equals(contentType)) {
+            m.set(Metadata.CONTENT_TYPE, contentType);
+        }
+        InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8));
+
+        extractor.parseEmbedded(
+                is,
+                new EmbeddedContentHandler(handler),
+                m, true);
+    }
+
+    /**
+     * Returns the extractor registered in the context. If none is
+     * registered, a {@link ParsingEmbeddedDocumentExtractor} is created;
+     * in that case a MockParser is first registered as the context's
+     * Parser when the context has no Parser yet.
+     */
+    protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
+        EmbeddedDocumentExtractor extractor =
+                context.get(EmbeddedDocumentExtractor.class);
+        if (extractor == null) {
+            Parser p = context.get(Parser.class);
+            if (p == null) {
+                context.set(Parser.class, new MockParser());
+            }
+            extractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+        return extractor;
+    }
+
+    /**
+     * Prints the text content of the action to System.out ("print_out")
+     * or System.err ("print_err").
+     *
+     * @throws IllegalArgumentException for any other name
+     */
+    private void print(Node action, String name) {
+        String content = action.getTextContent();
+        if ("print_out".equals(name)) {
+            System.out.println(content);
+        } else if ("print_err".equals(name)) {
+            System.err.println(content);
+        } else {
+            throw new IllegalArgumentException("must be print_out or print_err");
+        }
+    }
+
+    /**
+     * Blocks the calling thread as described by the action's attributes:
+     * <ul>
+     * <li>"millis" (required): how long to hang</li>
+     * <li>"interruptible" (optional, default true)</li>
+     * <li>"heavy" (optional, default false): busy-compute instead of sleeping</li>
+     * <li>"pulse_millis" (required if heavy): passed to the heavy hang as
+     * its check interval</li>
+     * </ul>
+     *
+     * @throws RuntimeException if a required attribute is missing or is not a long
+     */
+    private void hang(Node action) {
+        boolean interruptible = true;
+        boolean heavy = false;
+        NamedNodeMap attrs = action.getAttributes();
+        Node iNode = attrs.getNamedItem("interruptible");
+        if (iNode != null) {
+            interruptible = ("true".equals(iNode.getNodeValue()));
+        }
+        Node hNode = attrs.getNamedItem("heavy");
+        if (hNode != null) {
+            heavy = ("true".equals(hNode.getNodeValue()));
+        }
+
+        Node mNode = attrs.getNamedItem("millis");
+        if (mNode == null) {
+            throw new RuntimeException("Must specify \"millis\" attribute for hang.");
+        }
+        long millis = parseLongAttribute(mNode, "millis");
+
+        if (!heavy) {
+            sleep(millis, interruptible);
+            return;
+        }
+
+        Node pNode = attrs.getNamedItem("pulse_millis");
+        if (pNode == null) {
+            throw new RuntimeException("Must specify attribute \"pulse_millis\" if the hang is \"heavy\"");
+        }
+        // read the value from pNode: taking it from mNode would silently
+        // reuse the "millis" value as the pulse
+        long pulseMillis = parseLongAttribute(pNode, "pulse_millis");
+        hangHeavy(millis, pulseMillis, interruptible);
+    }
+
+    /**
+     * Parses the value of an attribute node as a long.
+     *
+     * @param attr     the attribute node
+     * @param attrName the attribute name, used in the error message
+     * @throws RuntimeException if the value is not a long; the
+     *         NumberFormatException is kept as the cause
+     */
+    private static long parseLongAttribute(Node attr, String attrName) {
+        try {
+            return Long.parseLong(attr.getNodeValue());
+        } catch (NumberFormatException e) {
+            throw new RuntimeException("Value for \"" + attrName + "\" attribute must be a long.", e);
+        }
+    }
+
+    /**
+     * Instantiates and throws the throwable named by the "class"
+     * attribute, using the text content of the action as its message.
+     * Throws a NullPointerException if there is no "class" attribute.
+     */
+    private void throwIt(Node action) throws IOException,
+            SAXException, TikaException {
+        NamedNodeMap attrs = action.getAttributes();
+        String className = attrs.getNamedItem("class").getNodeValue();
+        String msg = action.getTextContent();
+        throwIt(className, msg);
+    }
+
+    /**
+     * Stores the text content of the action under the key given by the
+     * "name" attribute. The value is added unless the optional "action"
+     * attribute is "set", in which case it replaces the existing value.
+     */
+    private void metadata(Node action, Metadata metadata) {
+        NamedNodeMap attrs = action.getAttributes();
+        //throws npe unless there is a name
+        String name = attrs.getNamedItem("name").getNodeValue();
+        String value = action.getTextContent();
+        Node actionType = attrs.getNamedItem("action");
+        if (actionType != null && "set".equals(actionType.getNodeValue())) {
+            metadata.set(name, value);
+        } else {
+            metadata.add(name, value);
+        }
+    }
+
+    /**
+     * Writes the text content of the action inside an element whose name
+     * comes from the optional "element" attribute (default "p").
+     */
+    private void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
+        NamedNodeMap attrs = action.getAttributes();
+        Node eNode = attrs.getNamedItem("element");
+        String elementType = "p";
+        if (eNode != null) {
+            elementType = eNode.getTextContent();
+        }
+        String text = action.getTextContent();
+        xhtml.startElement(elementType);
+        xhtml.characters(text);
+        xhtml.endElement(elementType);
+    }
+
+    /**
+     * Creates an instance of {@code className} and throws it. The no-arg
+     * constructor is used when {@code msg} is null or empty, the
+     * single-String constructor otherwise. Checked throwables other than
+     * SAXException, IOException and TikaException are wrapped in a
+     * RuntimeException.
+     *
+     * @throws RuntimeException if the class cannot be loaded or instantiated
+     */
+    private void throwIt(String className, String msg) throws IOException,
+            SAXException, TikaException {
+        Throwable t;
+        try {
+            Class<?> clazz = Class.forName(className);
+            if (msg == null || msg.equals("")) {
+                t = (Throwable) clazz.newInstance();
+            } else {
+                Constructor<?> con = clazz.getConstructor(String.class);
+                t = (Throwable) con.newInstance(msg);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("couldn't create throwable class:" + className, e);
+        }
+        if (t instanceof SAXException) {
+            throw (SAXException) t;
+        } else if (t instanceof IOException) {
+            throw (IOException) t;
+        } else if (t instanceof TikaException) {
+            throw (TikaException) t;
+        } else if (t instanceof Error) {
+            throw (Error) t;
+        } else if (t instanceof RuntimeException) {
+            throw (RuntimeException) t;
+        } else {
+            //wrap the throwable in a RuntimeException
+            throw new RuntimeException(t);
+        }
+    }
+
+    /**
+     * Allocates arrays in an endless loop and keeps them all reachable;
+     * the loop has no exit, so the method never returns normally.
+     */
+    private void kabOOM() {
+        List<int[]> ints = new ArrayList<int[]>();
+
+        while (true) {
+            int[] intArr = new int[32000];
+            ints.add(intArr);
+        }
+    }
+
+    /**
+     * Keeps the CPU busy until {@code maxMillis} have elapsed or, if
+     * {@code interruptible}, until the thread's interrupt flag is set.
+     * <p>
+     * Despite its name, {@code pulseCheckMillis} is compared against an
+     * iteration counter: the elapsed time and the interrupt flag are
+     * checked once every {@code pulseCheckMillis + 1} inner iterations.
+     */
+    private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) {
+        //do some heavy computation and occasionally check for
+        //whether time has exceeded maxMillis (see TIKA-1132 for inspiration)
+        //or whether the thread was interrupted
+        long start = new Date().getTime();
+        int lastChecked = 0;
+        while (true) {
+            for (int i = 1; i < Integer.MAX_VALUE; i++) {
+                for (int j = 1; j < Integer.MAX_VALUE; j++) {
+                    // the result is unused on purpose; the division is the "work"
+                    double div = (double) i / (double) j;
+                    lastChecked++;
+                    if (lastChecked > pulseCheckMillis) {
+                        lastChecked = 0;
+                        if (interruptible && Thread.currentThread().isInterrupted()) {
+                            return;
+                        }
+                        long elapsed = new Date().getTime()-start;
+                        if (elapsed > maxMillis) {
+                            return;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Sleeps for {@code maxMillis}. If the thread is interrupted and
+     * {@code isInterruptible} is true the method returns at once;
+     * otherwise the interruption is ignored and the method goes back to
+     * sleep for whatever time is left.
+     */
+    private void sleep(long maxMillis, boolean isInterruptible) {
+        long start = new Date().getTime();
+        long millisRemaining = maxMillis;
+        while (true) {
+            try {
+                Thread.sleep(millisRemaining);
+            } catch (InterruptedException e) {
+                if (isInterruptible) {
+                    return;
+                }
+                // not interruptible: fall through and sleep for the remainder
+            }
+            long elapsed = new Date().getTime()-start;
+            millisRemaining = maxMillis - elapsed;
+            if (millisRemaining <= 0) {
+                break;
+            }
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
deleted file mode 100644
index 29fa3af..0000000
--- a/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
+++ /dev/null
@@ -1,247 +0,0 @@
-package org.apache.tika.parser.mock;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PrintStream;
-import java.util.Date;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-
-public class MockParserTest extends TikaTest {
- private final static String M = "/test-documents/mock/";
- private final static Parser PARSER = new AutoDetectParser();
-
- @Override
- public XMLResult getXML(String path, Metadata m) throws Exception {
- //note that this is specific to MockParserTest with addition of M to the path!
- InputStream is = getResourceAsStream(M+path);
- try {
- return super.getXML(is, PARSER, m);
- } finally {
- IOUtils.closeQuietly(is);
- }
- }
-
- @Test
- public void testExample() throws Exception {
- Metadata m = new Metadata();
- PrintStream out = System.out;
- PrintStream err = System.err;
- ByteArrayOutputStream outBos = new ByteArrayOutputStream();
- ByteArrayOutputStream errBos = new ByteArrayOutputStream();
- PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString());
- PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString());
- System.setOut(tmpOut);
- System.setErr(tmpErr);
- try {
- assertThrowable("example.xml", m, IOException.class, "not another IOException");
- assertMockParser(m);
- } finally {
- System.setOut(out);
- System.setErr(err);
- }
- String outString = new String(outBos.toByteArray(), UTF_8);
- assertContains("writing to System.out", outString);
-
- String errString = new String(errBos.toByteArray(), UTF_8);
- assertContains("writing to System.err", errString);
-
- }
-
- @Test
- public void testNothingBad() throws Exception {
- Metadata m = new Metadata();
- String content = getXML("nothing_bad.xml", m).xml;
- assertEquals("Geoffrey Chaucer", m.get("author"));
- assertContains("<p>And bathed every veyne in swich licour,</p>", content);
- assertMockParser(m);
- }
-
- @Test
- public void testNullPointer() throws Exception {
- Metadata m = new Metadata();
- assertThrowable("null_pointer.xml", m, NullPointerException.class, "another null pointer exception");
- assertMockParser(m);
- }
-
- @Test
- public void testNullPointerNoMsg() throws Exception {
- Metadata m = new Metadata();
- assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null);
- assertMockParser(m);
- }
-
-
- @Test
- public void testSleep() throws Exception {
- long start = new Date().getTime();
- Metadata m = new Metadata();
- String content = getXML("sleep.xml", m).xml;
- assertMockParser(m);
- long elapsed = new Date().getTime()-start;
- //should sleep for at least 3000
- boolean enoughTimeHasElapsed = elapsed > 2000;
- assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed);
- assertMockParser(m);
- }
-
- @Test
- public void testHeavyHang() throws Exception {
- long start = new Date().getTime();
- Metadata m = new Metadata();
-
- String content = getXML("heavy_hang.xml", m).xml;
- assertMockParser(m);
- long elapsed = new Date().getTime()-start;
- //should sleep for at least 3000
- boolean enoughTimeHasElapsed = elapsed > 2000;
- assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed);
- assertMockParser(m);
- }
-
- @Test
- public void testFakeOOM() throws Exception {
- Metadata m = new Metadata();
- assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom");
- assertMockParser(m);
- }
-
- @Test
- public void testRealOOM() throws Exception {
- //Note: we're not actually testing the diff between fake and real oom
- //i.e. by creating child process and setting different -Xmx or
- //memory profiling.
- Metadata m = new Metadata();
- assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space");
- assertMockParser(m);
- }
-
- @Test
- public void testInterruptibleSleep() {
- //Without static initialization of the parser, it can take ~1 second after t.start()
- //before the parser actually calls parse. This is
- //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc.
- //This is not thread creation overhead.
- ParserRunnable r = new ParserRunnable("sleep_interruptible.xml");
- Thread t = new Thread(r);
- t.start();
- long start = new Date().getTime();
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- //swallow
- }
-
- t.interrupt();
-
- try {
- t.join(10000);
- } catch (InterruptedException e) {
- //swallow
- }
- long elapsed = new Date().getTime()-start;
- boolean shortEnough = elapsed < 2000;//the xml file specifies 3000
- assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough);
- }
-
- @Test
- public void testNonInterruptibleSleep() {
- ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml");
- Thread t = new Thread(r);
- t.start();
- long start = new Date().getTime();
- try {
- //make sure that the thread has actually started
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- //swallow
- }
- t.interrupt();
- try {
- t.join(20000);
- } catch (InterruptedException e) {
- //swallow
- }
- long elapsed = new Date().getTime()-start;
- boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000
- assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough);
- }
-
- private class ParserRunnable implements Runnable {
- private final String path;
- ParserRunnable(String path) {
- this.path = path;
- }
- @Override
- public void run() {
- Metadata m = new Metadata();
- try {
- getXML(path, m);
- } catch (Exception e) {
- throw new RuntimeException(e);
- } finally {
- assertMockParser(m);
- }
- }
- }
-
- private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) {
-
- try {
- getXML(path, m);
- } catch (Throwable t) {
- //if this is a throwable wrapped in a TikaException, use the cause
- if (t instanceof TikaException && t.getCause() != null) {
- t = t.getCause();
- }
- if (! (t.getClass().isAssignableFrom(expected))){
- fail(t.getClass() +" is not assignable from "+expected);
- }
- if (message != null) {
- assertEquals(message, t.getMessage());
- }
- }
- }
-
- private void assertMockParser(Metadata m) {
- String[] parsers = m.getValues("X-Parsed-By");
- //make sure that it was actually parsed by mock.
- boolean parsedByMock = false;
- for (String parser : parsers) {
- if (parser.equals("org.apache.tika.parser.mock.MockParser")) {
- parsedByMock = true;
- break;
- }
- }
- assertTrue("mock parser should have been called", parsedByMock);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..69bfdeb
--- /dev/null
+++ b/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1 @@
+org.apache.tika.parser.mock.MockParser
\ No newline at end of file