You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/03/31 03:54:41 UTC

svn commit: r1670237 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/ tika-app/src/test/java/org/apache/tika/cli/ tika-batch/src/main/java/org/apache/tika/batch/builders/ tika-batch/src/main/resources/org/apache/tika/batch/fs/ tika-batch/src/test/resources/

Author: tallison
Date: Tue Mar 31 01:54:40 2015
New Revision: 1670237

URL: http://svn.apache.org/r1670237
Log:
TIKA-1330: add integration tests to TikaCLITest

Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
    tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java
    tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
    tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
    tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
    tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Mar 31 01:54:40 2015
@@ -125,7 +125,7 @@ public class TikaCLI {
             String[] batchArgs = BatchCommandLineBuilder.build(args);
             BatchProcessDriverCLI batchDriver = new BatchProcessDriverCLI(batchArgs);
             batchDriver.execute();
-            System.exit(0);
+            return;
         }
 
         BasicConfigurator.configure(

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Tue Mar 31 01:54:40 2015
@@ -16,16 +16,26 @@
  */
 package org.apache.tika.cli;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
 import java.io.PrintStream;
+import java.io.Reader;
 import java.net.URI;
+import java.util.List;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -378,4 +388,96 @@ public class TikaCLITest {
         assertTrue(content.contains("\\n\\nembed_0"));
     }
 
+    @Test
+    public void testSimplestBatchIntegration() throws Exception {
+        File tempDir = File.createTempFile("tika-cli-test-batch-", "");
+        tempDir.delete();
+        tempDir.mkdir();
+        ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
+        PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+        OutputStream os = System.out;
+        System.setOut(writer);
+        try {
+            String[] params = {escape(testDataFile.getAbsolutePath()),
+                    escape(tempDir.getAbsolutePath())};
+            TikaCLI.main(params);
+
+            StringBuffer allFiles = new StringBuffer();
+            assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
+            assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
+        } finally {
+            //reset in case something went horribly wrong
+            System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
+            FileUtils.deleteDirectory(tempDir);
+        }
+    }
+
+    @Test
+    public void testBasicBatchIntegration() throws Exception {
+        File tempDir = File.createTempFile("tika-cli-test-batch-", "");
+        tempDir.delete();
+        tempDir.mkdir();
+        ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
+        PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+        OutputStream os = System.out;
+        System.setOut(writer);
+        try {
+            String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+                    "-o", escape(tempDir.getAbsolutePath()),
+                    "-numConsumers", "2",
+                    "-reporterSleepMillis", "100"};//report often to make sure
+            TikaCLI.main(params);
+
+            StringBuffer allFiles = new StringBuffer();
+            assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
+            assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
+            String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8);
+
+            assertEquals(-1, sysOutString.indexOf("There are 3 file processors still active"));
+            assertTrue(sysOutString.indexOf("There are 2 file processors") > -1);
+        } finally {
+            //reset in case something went horribly wrong
+            System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
+            FileUtils.deleteDirectory(tempDir);
+        }
+    }
+
+    @Test
+    public void testJsonRecursiveBatchIntegration() throws Exception {
+        File tempDir = File.createTempFile("tika-cli-test-batch-", "");
+        tempDir.delete();
+        tempDir.mkdir();
+        ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
+        PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
+        OutputStream os = System.out;
+        System.setOut(writer);
+        Reader reader = null;
+        try {
+            String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+                    "-o", escape(tempDir.getAbsolutePath()),
+                    "-numConsumers", "10",
+                    "-J", //recursive Json
+                    "-t" //plain text in content
+            };
+            TikaCLI.main(params);
+            reader = new InputStreamReader(
+                    new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+            assertEquals(12, metadataList.size());
+            assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events"));
+        } finally {
+            IOUtils.closeQuietly(reader);
+            //reset in case something went horribly wrong
+            System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
+            FileUtils.deleteDirectory(tempDir);
+        }
+    }
+
+
+    public static String escape(String path) {
+        if (path.indexOf(' ') > -1){
+            return '"'+path+'"';
+        }
+        return path;
+    }
 }

Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java (original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java Tue Mar 31 01:54:40 2015
@@ -30,13 +30,13 @@ public class SimpleLogReporterBuilder im
 
     @Override
     public StatusReporter build(FileResourceCrawler crawler, ConsumersManager consumersManager,
-                                Node n, Map<String, String> commandlineArguments) {
-
-        Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(n, commandlineArguments);
-        long sleepMillis = PropsUtil.getLong(attributes.get("sleepMillis"), 1000L);
-        long staleThresholdMillis = PropsUtil.getLong(attributes.get("reporterStaleThresholdMillis"), 500000L);
-        StatusReporter reporter = new StatusReporter(crawler, consumersManager);
-        reporter.setSleepMillis(sleepMillis);
+                                Node n, Map<String, String> commandlineArguments) {
+
+        Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(n, commandlineArguments);
+        long sleepMillis = PropsUtil.getLong(attributes.get("reporterSleepMillis"), 1000L);
+        long staleThresholdMillis = PropsUtil.getLong(attributes.get("reporterStaleThresholdMillis"), 500000L);
+        StatusReporter reporter = new StatusReporter(crawler, consumersManager);
+        reporter.setSleepMillis(sleepMillis);
         reporter.setStaleThresholdMillis(staleThresholdMillis);
         return reporter;
     }

Modified: tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml (original)
+++ tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml Tue Mar 31 01:54:40 2015
@@ -26,13 +26,13 @@
 <tika-batch-config
         maxAliveTimeSeconds="-1"
         pauseOnEarlyTerminationMillis="10000"
-        timeoutThresholdMillis="300000"
-        timeoutCheckPulseMillis="1000"
-        maxQueueSize="10000"
-        numConsumers="5">
-
-    <!-- options to allow on the commandline -->
-    <commandline>
+        timeoutThresholdMillis="300000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
+
+    <!-- options to allow on the commandline -->
+    <commandline>
         <option opt="c" longOpt="tika-config" hasArg="true"
                 description="TikaConfig file"/>
         <option opt="bc" longOpt="batch-config" hasArg="true"
@@ -72,12 +72,14 @@
         <option opt="timeoutThresholdMillis" hasArg="true"
                 description="how long to wait before determining that a consumer is stale"/>
         <option opt="includeFilePat" hasArg="true"
-                description="regex that specifies which files to process"/>
-        <option opt="excludeFilePat" hasArg="true"
-                description="regex that specifies which files to avoid processing"/>
-    </commandline>
-
-
+                description="regex that specifies which files to process"/>
+        <option opt="excludeFilePat" hasArg="true"
+                description="regex that specifies which files to avoid processing"/>
+        <option opt="reporterSleepMillis" hasArg="true"
+                description="millisecond between reports by the reporter"/>
+    </commandline>
+
+
     <!-- can specify inputDir="input", but the default config should not include this -->
     <!-- can also specify startDir="input/someDir" to specify which child directory
          to start processing -->
@@ -111,10 +113,10 @@
         <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" -->
         <!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
         <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
-    </consumers>
-
-    <!-- reporter and interrupter are optional -->
-    <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
-              reporterStaleThresholdMillis="60000"/>
-    <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+    </consumers>
+
+    <!-- reporter and interrupter are optional -->
+    <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="60000"/>
+    <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
 </tika-batch-config>
\ No newline at end of file

Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml (original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml Tue Mar 31 01:54:40 2015
@@ -103,10 +103,10 @@
 
 		<outputstream class="FSOutputStreamFactory"
                 encoding="UTF-8" outputSuffix="xml"/>
-	</consumers>
-	
-	<!-- reporter and interrupter are optional -->
-	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
-              reporterStaleThresholdMillis="500000"/>
-	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+	</consumers>
+	
+	<!-- reporter and interrupter are optional -->
+	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="500000"/>
+	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
 </tika-batch-config>
\ No newline at end of file

Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml (original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Tue Mar 31 01:54:40 2015
@@ -96,10 +96,10 @@
 
 		<outputstream class="FSOutputStreamFactory"
                 encoding="UTF-8" outputSuffix="xml"/>
-	</consumers>
-	
-	<!-- reporter and interrupter are optional -->
-	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
-              reporterStaleThresholdMillis="500000"/>
-	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+	</consumers>
+	
+	<!-- reporter and interrupter are optional -->
+	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="500000"/>
+	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
 </tika-batch-config>
\ No newline at end of file

Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1670237&r1=1670236&r2=1670237&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml (original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Tue Mar 31 01:54:40 2015
@@ -102,10 +102,10 @@
 
 		<outputstream class="FSOutputStreamFactory"
                 encoding="UTF-8" outputSuffix="xml"/>
-	</consumers>
-	
-	<!-- reporter and interrupter are optional -->
-	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
-              reporterStaleThresholdMillis="500000"/>
-	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+	</consumers>
+	
+	<!-- reporter and interrupter are optional -->
+	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="500000"/>
+	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
 </tika-batch-config>
\ No newline at end of file