You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/03/31 03:54:41 UTC
svn commit: r1670237 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/cli/
tika-app/src/test/java/org/apache/tika/cli/
tika-batch/src/main/java/org/apache/tika/batch/builders/
tika-batch/src/main/resources/org/apache/tika/batch/fs/ tika-batch/src/test/resources/
Author: tallison
Date: Tue Mar 31 01:54:40 2015
New Revision: 1670237
URL: http://svn.apache.org/r1670237
Log:
TIKA-1330: add integration tests to TikaCLITest
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Mar 31 01:54:40 2015
@@ -125,7 +125,7 @@ public class TikaCLI {
String[] batchArgs = BatchCommandLineBuilder.build(args);
BatchProcessDriverCLI batchDriver = new BatchProcessDriverCLI(batchArgs);
batchDriver.execute();
- System.exit(0);
+ return;
}
BasicConfigurator.configure(
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Tue Mar 31 01:54:40 2015
@@ -16,16 +16,26 @@
*/
package org.apache.tika.cli;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
import java.io.PrintStream;
+import java.io.Reader;
import java.net.URI;
+import java.util.List;
+
import org.apache.commons.io.FileUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -378,4 +388,96 @@ public class TikaCLITest {
assertTrue(content.contains("\\n\\nembed_0"));
}
+ @Test
+ public void testSimplestBatchIntegration() throws Exception {
+ File tempDir = File.createTempFile("tika-cli-test-batch-", "");
+ tempDir.delete();
+ tempDir.mkdir();
+ ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
+ PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+ OutputStream os = System.out;
+ System.setOut(writer);
+ try {
+ String[] params = {escape(testDataFile.getAbsolutePath()),
+ escape(tempDir.getAbsolutePath())};
+ TikaCLI.main(params);
+
+ StringBuffer allFiles = new StringBuffer();
+ assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
+ assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
+ } finally {
+ //reset in case something went horribly wrong
+ System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
+ FileUtils.deleteDirectory(tempDir);
+ }
+ }
+
+ @Test
+ public void testBasicBatchIntegration() throws Exception {
+ File tempDir = File.createTempFile("tika-cli-test-batch-", "");
+ tempDir.delete();
+ tempDir.mkdir();
+ ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
+ PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+ OutputStream os = System.out;
+ System.setOut(writer);
+ try {
+ String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+ "-o", escape(tempDir.getAbsolutePath()),
+ "-numConsumers", "2",
+ "-reporterSleepMillis", "100"};//report often to make sure
+ TikaCLI.main(params);
+
+ StringBuffer allFiles = new StringBuffer();
+ assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
+ assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
+ String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8);
+
+ assertEquals(-1, sysOutString.indexOf("There are 3 file processors still active"));
+ assertTrue(sysOutString.indexOf("There are 2 file processors") > -1);
+ } finally {
+ //reset in case something went horribly wrong
+ System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
+ FileUtils.deleteDirectory(tempDir);
+ }
+ }
+
+ @Test
+ public void testJsonRecursiveBatchIntegration() throws Exception {
+ File tempDir = File.createTempFile("tika-cli-test-batch-", "");
+ tempDir.delete();
+ tempDir.mkdir();
+ ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
+ PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+ OutputStream os = System.out;
+ System.setOut(writer);
+ Reader reader = null;
+ try {
+ String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+ "-o", escape(tempDir.getAbsolutePath()),
+ "-numConsumers", "10",
+ "-J", //recursive Json
+ "-t" //plain text in content
+ };
+ TikaCLI.main(params);
+ reader = new InputStreamReader(
+ new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events"));
+ } finally {
+ IOUtils.closeQuietly(reader);
+ //reset in case something went horribly wrong
+ System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
+ FileUtils.deleteDirectory(tempDir);
+ }
+ }
+
+
+ public static String escape(String path) {
+ if (path.indexOf(' ') > -1){
+ return '"'+path+'"';
+ }
+ return path;
+ }
}
Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java (original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java Tue Mar 31 01:54:40 2015
@@ -30,13 +30,13 @@ public class SimpleLogReporterBuilder im
@Override
public StatusReporter build(FileResourceCrawler crawler, ConsumersManager consumersManager,
- Node n, Map<String, String> commandlineArguments) {
-
- Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(n, commandlineArguments);
- long sleepMillis = PropsUtil.getLong(attributes.get("sleepMillis"), 1000L);
- long staleThresholdMillis = PropsUtil.getLong(attributes.get("reporterStaleThresholdMillis"), 500000L);
- StatusReporter reporter = new StatusReporter(crawler, consumersManager);
- reporter.setSleepMillis(sleepMillis);
+ Node n, Map<String, String> commandlineArguments) {
+
+ Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(n, commandlineArguments);
+ long sleepMillis = PropsUtil.getLong(attributes.get("reporterSleepMillis"), 1000L);
+ long staleThresholdMillis = PropsUtil.getLong(attributes.get("reporterStaleThresholdMillis"), 500000L);
+ StatusReporter reporter = new StatusReporter(crawler, consumersManager);
+ reporter.setSleepMillis(sleepMillis);
reporter.setStaleThresholdMillis(staleThresholdMillis);
return reporter;
}
Modified: tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml (original)
+++ tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml Tue Mar 31 01:54:40 2015
@@ -26,13 +26,13 @@
<tika-batch-config
maxAliveTimeSeconds="-1"
pauseOnEarlyTerminationMillis="10000"
- timeoutThresholdMillis="300000"
- timeoutCheckPulseMillis="1000"
- maxQueueSize="10000"
- numConsumers="5">
-
- <!-- options to allow on the commandline -->
- <commandline>
+ timeoutThresholdMillis="300000"
+ timeoutCheckPulseMillis="1000"
+ maxQueueSize="10000"
+ numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
+
+ <!-- options to allow on the commandline -->
+ <commandline>
<option opt="c" longOpt="tika-config" hasArg="true"
description="TikaConfig file"/>
<option opt="bc" longOpt="batch-config" hasArg="true"
@@ -72,12 +72,14 @@
<option opt="timeoutThresholdMillis" hasArg="true"
description="how long to wait before determining that a consumer is stale"/>
<option opt="includeFilePat" hasArg="true"
- description="regex that specifies which files to process"/>
- <option opt="excludeFilePat" hasArg="true"
- description="regex that specifies which files to avoid processing"/>
- </commandline>
-
-
+ description="regex that specifies which files to process"/>
+ <option opt="excludeFilePat" hasArg="true"
+ description="regex that specifies which files to avoid processing"/>
+ <option opt="reporterSleepMillis" hasArg="true"
+ description="millisecond between reports by the reporter"/>
+ </commandline>
+
+
<!-- can specify inputDir="input", but the default config should not include this -->
<!-- can also specify startDir="input/someDir" to specify which child directory
to start processing -->
@@ -111,10 +113,10 @@
<!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" -->
<!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
<outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
- </consumers>
-
- <!-- reporter and interrupter are optional -->
- <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
- reporterStaleThresholdMillis="60000"/>
- <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+ </consumers>
+
+ <!-- reporter and interrupter are optional -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+ reporterStaleThresholdMillis="60000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
</tika-batch-config>
\ No newline at end of file
Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml (original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml Tue Mar 31 01:54:40 2015
@@ -103,10 +103,10 @@
<outputstream class="FSOutputStreamFactory"
encoding="UTF-8" outputSuffix="xml"/>
- </consumers>
-
- <!-- reporter and interrupter are optional -->
- <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
- reporterStaleThresholdMillis="500000"/>
- <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+ </consumers>
+
+ <!-- reporter and interrupter are optional -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+ reporterStaleThresholdMillis="500000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
</tika-batch-config>
\ No newline at end of file
Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml (original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Tue Mar 31 01:54:40 2015
@@ -96,10 +96,10 @@
<outputstream class="FSOutputStreamFactory"
encoding="UTF-8" outputSuffix="xml"/>
- </consumers>
-
- <!-- reporter and interrupter are optional -->
- <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
- reporterStaleThresholdMillis="500000"/>
- <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+ </consumers>
+
+ <!-- reporter and interrupter are optional -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+ reporterStaleThresholdMillis="500000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
</tika-batch-config>
\ No newline at end of file
Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml (original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Tue Mar 31 01:54:40 2015
@@ -102,10 +102,10 @@
<outputstream class="FSOutputStreamFactory"
encoding="UTF-8" outputSuffix="xml"/>
- </consumers>
-
- <!-- reporter and interrupter are optional -->
- <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
- reporterStaleThresholdMillis="500000"/>
- <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+ </consumers>
+
+ <!-- reporter and interrupter are optional -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+ reporterStaleThresholdMillis="500000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
</tika-batch-config>
\ No newline at end of file