You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/31 17:53:35 UTC

[1/2] tika git commit: TIKA-1918: make outputSuffix optional in tika-batch

Repository: tika
Updated Branches:
  refs/heads/master c94236a83 -> 01109c8fe


TIKA-1918: make outputSuffix optional in tika-batch


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/34db9359
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/34db9359
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/34db9359

Branch: refs/heads/master
Commit: 34db93595c71745e3bccdabc39e72181c03abbbd
Parents: 9ebf066
Author: tballison <ta...@mitre.org>
Authored: Thu Mar 31 11:52:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Mar 31 11:52:27 2016 -0400

----------------------------------------------------------------------
 .../tika/cli/BatchCommandLineBuilder.java       |   7 --
 .../main/resources/tika-app-batch-config.xml    |  10 +-
 .../tika/cli/TikaCLIBatchCommandLineTest.java   |   1 -
 .../builders/BasicTikaFSConsumersBuilder.java   |  51 ++++++++-
 .../tika/batch/fs/default-tika-batch-config.xml |  50 +++++----
 .../apache/tika/batch/fs/BatchProcessTest.java  |  19 +++-
 .../tika/batch/fs/HandlerBuilderTest.java       |   4 -
 .../tika-batch-config-MockConsumersBuilder.xml  |   2 +-
 .../test/resources/tika-batch-config-broken.xml |   2 +-
 .../tika-batch-config-test-suffix-override.xml  | 112 +++++++++++++++++++
 .../test/resources/tika-batch-config-test.xml   |   2 +-
 .../tika/sax/BasicContentHandlerFactory.java    |   8 ++
 12 files changed, 222 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
index da44956..2f85546 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
@@ -160,35 +160,28 @@ class BatchCommandLineBuilder {
             map.remove("-h");
             map.remove("--html");
             map.put("-basicHandlerType", "html");
-            map.put("-outputSuffix", "html");
         } else if (map.containsKey("-x") || map.containsKey("--xml")) {
             map.remove("-x");
             map.remove("--xml");
             map.put("-basicHandlerType", "xml");
-            map.put("-outputSuffix", "xml");
         } else if (map.containsKey("-t") || map.containsKey("--text")) {
             map.remove("-t");
             map.remove("--text");
             map.put("-basicHandlerType", "text");
-            map.put("-outputSuffix", "txt");
         } else if (map.containsKey("-m") || map.containsKey("--metadata")) {
             map.remove("-m");
             map.remove("--metadata");
             map.put("-basicHandlerType", "ignore");
-            map.put("-outputSuffix", "json");
         } else if (map.containsKey("-T") || map.containsKey("--text-main")) {
             map.remove("-T");
             map.remove("--text-main");
             map.put("-basicHandlerType", "body");
-            map.put("-outputSuffix", "txt");
         }
 
         if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) {
             map.remove("-J");
             map.remove("--jsonRecursive");
             map.put("-recursiveParserWrapper", "true");
-            //overwrite outputSuffix
-            map.put("-outputSuffix", "json");
         }
 
         if (map.containsKey("--inputDir") || map.containsKey("-i")) {

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/resources/tika-app-batch-config.xml
----------------------------------------------------------------------
diff --git a/tika-app/src/main/resources/tika-app-batch-config.xml b/tika-app/src/main/resources/tika-app-batch-config.xml
index e2f1204..99651a1 100644
--- a/tika-app/src/main/resources/tika-app-batch-config.xml
+++ b/tika-app/src/main/resources/tika-app-batch-config.xml
@@ -124,9 +124,13 @@
                 digest="md5" digestMarkLimit="1000000"/>
         <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
-        <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" -->
-        <!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
-        <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
+        <!-- can specify custom output file suffix with:
+            outputSuffix="mysuffix" (do not include a leading ".")
+            if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess -->
+        <!-- can specify compression with
+            compression="bzip2|gzip|zip" -->
+
+        <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
     </consumers>
 
     <!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
index 260273e..e543ccc 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
@@ -113,7 +113,6 @@ public class TikaCLIBatchCommandLineTest {
         Map<String, String> attrs = mapify(commandLine);
         assertEquals("true", attrs.get("-recursiveParserWrapper"));
         assertEquals("html", attrs.get("-basicHandlerType"));
-        assertEquals("json", attrs.get("-outputSuffix"));
         assertEquals("batch-config.xml", attrs.get("-bc"));
         assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
----------------------------------------------------------------------
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
index b65b046..4879af4 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
@@ -41,6 +41,7 @@ import org.apache.tika.batch.fs.FSOutputStreamFactory;
 import org.apache.tika.batch.fs.FSUtil;
 import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.util.ClassLoaderUtil;
 import org.apache.tika.util.PropsUtil;
@@ -125,7 +126,9 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
         }
         ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes);
         ParserFactory parserFactory = getParserFactory(parserFactoryNode, runtimeAttributes);
-        OutputStreamFactory outputStreamFactory = getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes);
+        OutputStreamFactory outputStreamFactory = getOutputStreamFactory(
+                outputStreamFactoryNode, runtimeAttributes,
+                contentHandlerFactory, recursiveParserWrapper);
 
         if (recursiveParserWrapper) {
             for (int i = 0; i < numConsumers; i++) {
@@ -147,7 +150,6 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
         return manager;
     }
 
-
     private ContentHandlerFactory getContentHandlerFactory(Node node, Map<String, String> runtimeAttributes) {
 
         Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
@@ -166,7 +168,10 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
         return builder.build(node, runtimeAttributes);
     }
 
-    private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String> runtimeAttributes) {
+    private OutputStreamFactory getOutputStreamFactory(Node node,
+                                                       Map<String, String> runtimeAttributes,
+                                                       ContentHandlerFactory contentHandlerFactory,
+                                                       boolean useRecursiveParserWrapper) {
         Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
 
         Path outputDir = PropsUtil.getPath(attrs.get("outputDir"), null);
@@ -196,6 +201,17 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
             compression = FSOutputStreamFactory.COMPRESSION.ZIP;
         }
         String suffix = attrs.get("outputSuffix");
+        //suffix should not start with "."
+        if (suffix == null) {
+            StringBuilder sb = new StringBuilder();
+            if (useRecursiveParserWrapper) {
+                sb.append("json");
+            } else if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
+                appendSuffix(((BasicContentHandlerFactory) contentHandlerFactory).getType(), sb);
+            }
+            appendCompression(compression, sb);
+            suffix = sb.toString();
+        }
 
         //TODO: possibly open up the different handle-existings in the future
         //but for now, lock it down to require skip.  Too dangerous otherwise
@@ -204,4 +220,33 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
                 compression, suffix);
     }
 
+    private void appendCompression(FSOutputStreamFactory.COMPRESSION compression, StringBuilder sb) {
+        switch (compression) {
+            case NONE:
+                break;
+            case ZIP:
+                sb.append(".zip");
+                break;
+            case BZIP2:
+                sb.append(".bz2");
+                break;
+            case GZIP:
+                sb.append(".gz");
+                break;
+        }
+    }
+
+    private void appendSuffix(BasicContentHandlerFactory.HANDLER_TYPE type, StringBuilder sb) {
+        switch (type) {
+            case XML:
+                sb.append("xml");
+                break;
+            case HTML:
+                sb.append("html");
+                break;
+            default :
+                sb.append("txt");
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
index 394c458..1b71152 100644
--- a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
+++ b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
@@ -26,13 +26,13 @@
 <tika-batch-config
         maxAliveTimeSeconds="-1"
         pauseOnEarlyTerminationMillis="10000"
-        timeoutThresholdMillis="300000"
-        timeoutCheckPulseMillis="1000"
-        maxQueueSize="10000"
-        numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
-
-    <!-- options to allow on the commandline -->
-    <commandline>
+        timeoutThresholdMillis="300000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
+
+    <!-- options to allow on the commandline -->
+    <commandline>
         <option opt="c" longOpt="tika-config" hasArg="true"
                 description="TikaConfig file"/>
         <option opt="bc" longOpt="batch-config" hasArg="true"
@@ -72,14 +72,14 @@
         <option opt="timeoutThresholdMillis" hasArg="true"
                 description="how long to wait before determining that a consumer is stale"/>
         <option opt="includeFilePat" hasArg="true"
-                description="regex that specifies which files to process"/>
-        <option opt="excludeFilePat" hasArg="true"
-                description="regex that specifies which files to avoid processing"/>
-        <option opt="reporterSleepMillis" hasArg="true"
-                description="millisecond between reports by the reporter"/>
-    </commandline>
-
-
+                description="regex that specifies which files to process"/>
+        <option opt="excludeFilePat" hasArg="true"
+                description="regex that specifies which files to avoid processing"/>
+        <option opt="reporterSleepMillis" hasArg="true"
+                description="millisecond between reports by the reporter"/>
+    </commandline>
+
+
     <!-- can specify inputDir="input", but the default config should not include this -->
     <!-- can also specify startDir="input/someDir" to specify which child directory
          to start processing -->
@@ -116,12 +116,16 @@
                 parseRecursively="true"/>
         <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
-        <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" -->        <!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
-        <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
-    </consumers>
-
-    <!-- reporter and interrupter are optional -->
-    <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
-              reporterStaleThresholdMillis="60000"/>
-    <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+        <!-- can specify custom output file suffix with:
+            outputSuffix="mysuffix" (do not include a leading ".")
+            if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess -->
+        <!-- can specify compression with
+            compression="bzip2|gzip|zip" -->
+        <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
+    </consumers>
+
+    <!-- reporter and interrupter are optional -->
+    <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="60000"/>
+    <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
 </tika-batch-config>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
index 8cea0b3..d623afb 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
@@ -276,7 +276,6 @@ public class BatchProcessTest extends FSBatchTestBase {
                 Paths.get(this.getClass().getResource("/testFileList.txt").toURI()).toString());
         args.put("recursiveParserWrapper", "true");
         args.put("basicHandlerType", "text");
-        args.put("outputSuffix", "json");
         BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args, "/tika-batch-config-MockConsumersBuilder.xml");
         ex.execute();
         Path test1 = outputDir.resolve("test1.xml.json");
@@ -302,7 +301,6 @@ public class BatchProcessTest extends FSBatchTestBase {
         args.put("numConsumers", "1");
         args.put("recursiveParserWrapper", "true");
         args.put("basicHandlerType", "text");
-        args.put("outputSuffix", "json");
 
         BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
                 "/tika-batch-config-MockConsumersBuilder.xml",
@@ -312,6 +310,23 @@ public class BatchProcessTest extends FSBatchTestBase {
         assertContains("parse_ex resourceId=\"test0_bad_chars.xml\"", ss.getOutString());
     }
 
+    @Test
+    public void testOverrideOutputSuffix() throws Exception {
+        Path outputDir = getNewOutputDir("outputSuffixTest");
+
+        Map<String, String> args = getDefaultArgs("basic", outputDir);
+        args.put("numConsumers", "1");
+        args.put("recursiveParserWrapper", "true");
+        args.put("basicHandlerType", "text");
+
+        BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
+                "/tika-batch-config-test-suffix-override.xml",
+                "/log4j-on.properties");
+        ex.execute();
+        Path targ = outputDir.resolve("test0.xml.mysuffix");
+        assertTrue(Files.isRegularFile(targ));
+    }
+
     private class BatchProcessTestExecutor {
         private final Map<String, String> args;
         private final String configPath;

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
index d8aecad..6e3648a 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
@@ -36,7 +36,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
         Path outputDir = getNewOutputDir("handler-xml-");
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "xml");
-        args.put("outputSuffix", "xml");
 
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
         ParallelFileProcessingResult result = run(runner);
@@ -54,7 +53,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
 
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "html");
-        args.put("outputSuffix", "html");
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
         ParallelFileProcessingResult result = run(runner);
         Path outputFile = outputDir.resolve("test0.xml.html");
@@ -70,7 +68,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
 
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "txt");
-        args.put("outputSuffix", "txt");
 
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
         ParallelFileProcessingResult result = run(runner);
@@ -105,7 +102,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
 
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "txt");
-        args.put("outputSuffix", "json");
         args.put("recursiveParserWrapper", "true");
 
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
index a2915cf..8da44be 100644
--- a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
@@ -103,7 +103,7 @@
                         basicHandlerType="xml" writeLimit="-1"/>
 
 		<outputstream class="FSOutputStreamFactory"
-                encoding="UTF-8" outputSuffix="xml"/>
+                encoding="UTF-8"/>
 	</consumers>
 	
 	<!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-broken.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-broken.xml b/tika-batch/src/test/resources/tika-batch-config-broken.xml
index 1d599b4..5b8490e 100644
--- a/tika-batch/src/test/resources/tika-batch-config-broken.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-broken.xml
@@ -97,7 +97,7 @@
                         basicHandlerType="xml" writeLimit="-1"/>
 
 		<outputstream class="FSOutputStreamFactory"
-                encoding="UTF-8" outputSuffix="xml"/>
+                encoding="UTF-8"/>
 	</consumers>
 	
 	<!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
new file mode 100644
index 0000000..911398f
--- /dev/null
+++ b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+    The configuration file will likely change and be backward incompatible
+    with new versions of Tika.  Please stay tuned.
+    -->
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="500"
+        timeoutThresholdMillis="3000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="3">
+    <!-- options to allow on the commandline -->
+    <commandline>
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <!-- We needed sorted for testing.  We added random for performance.
+             Where crawling a directory is slow, it might be beneficial to
+             go randomly so that the parsers are triggered earlier.  The
+             default is operating system's choice ("os") which means whatever order
+             the os returns files in .listFiles(). -->
+        <option opt="crawlOrder" hasArg="true"
+                description="how does the crawler sort the directories and files:
+                                (random|sorted|os)"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="minFileSizeBytes" hasArg="true"
+                description="minimum file size to process; do not process files smaller than this"/>
+        <option opt="maxFileSizeBytes" hasArg="true"
+                description="maximum file size to process; do not process files larger than this"/>
+        <option opt="maxQueueSize" hasArg="true"
+                description="maximum queue size for FileResources"/>
+        <option opt="fileList" hasArg="true"
+                description="file that contains a list of files (relative to inputDir) to process"/>
+        <option opt="fileListEncoding" hasArg="true"
+                description="encoding for fileList"/>
+        <option opt="inputDir" hasArg="true"
+                description="root directory for the files to be processed"
+                required="true"/>
+        <option opt="startDir" hasArg="true"
+                description="directory (under inputDir) at which to start crawling"/>
+        <option opt="outputDir" hasArg="true"
+                description="output directory"
+                required="true"/>
+        <option opt="recursiveParserWrapper"
+                description="use the RecursiveParserWrapper or not (default = false)"/>
+        <option opt="handleExisting" hasArg="true"
+                description="if an output file already exists, do you want to: overwrite, rename or skip"/>
+        <option opt="basicHandlerType" hasArg="true"
+                description="what type of content handler: xml, text, html, body"/>
+        <option opt="outputSuffix" hasArg="true"
+                description="suffix to add to the end of the output file name"/>
+        <option opt="timeoutThresholdMillis" hasArg="true"
+                description="how long to wait before determining that a consumer should be timed out"/>
+        <option opt="pauseOnEarlyTerminationMillis" hasArg="true"
+                description="how long to wait for parsers to finish if there is an early termination from the main loop."/>
+        <!-- in long running process, might be good to restart every hour or so to avoid memory leaks-->
+        <option opt="maxAliveTimeSeconds" hasArg="true"
+                description="how long should this process run in seconds."/>
+    </commandline>
+    <!--
+        Can also add startDir: this tells the crawler to start indexing a
+        child directory of the inputDir directory.
+    -->
+	<crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+        crawlOrder="sorted"
+        maxConsecWaitMillis="5000"
+        maxFilesToAdd="-1"
+		maxFilesToConsider="-1" 
+		includeFilePat=""
+		excludeFilePat=""
+		maxFileSizeBytes="-1"
+        />
+<!--        inputDir="tika-batch/src/test/resources/test-input" -->
+
+	<consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+               recursiveParserWrapper="false" consumersManagerMaxMillis="120000">
+        <parser builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+                class="org.apache.tika.parser.mock.MockParserFactory"
+                parseRecursively="true"/>
+		<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+                        basicHandlerType="xml" writeLimit="-1"/>
+
+		<outputstream class="FSOutputStreamFactory"
+                encoding="UTF-8" outputSuffix="mysuffix"/>
+	</consumers>
+	
+	<!-- reporter and interrupter are optional -->
+	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="500000"/>
+	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-test.xml b/tika-batch/src/test/resources/tika-batch-config-test.xml
index cf71fd6..755eb58 100644
--- a/tika-batch/src/test/resources/tika-batch-config-test.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-test.xml
@@ -102,7 +102,7 @@
                         basicHandlerType="xml" writeLimit="-1"/>
 
 		<outputstream class="FSOutputStreamFactory"
-                encoding="UTF-8" outputSuffix="xml"/>
+                encoding="UTF-8"/>
 	</consumers>
 	
 	<!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 810b72e..c611f09 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -153,4 +153,12 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory {
         }
     }
 
+    /**
+     *
+     * @return handler type used by this factory
+     */
+    public HANDLER_TYPE getType() {
+        return type;
+    }
+
 }


[2/2] tika git commit: Merge remote-tracking branch 'origin/master'

Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master'


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/01109c8f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/01109c8f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/01109c8f

Branch: refs/heads/master
Commit: 01109c8fec3a736749b9cd2dd741fbcc936cbf4c
Parents: 34db935 c94236a
Author: tballison <ta...@mitre.org>
Authored: Thu Mar 31 11:53:28 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Mar 31 11:53:28 2016 -0400

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml        |  15 +++++++--------
 .../java/org/apache/tika/mime/TestMimeTypes.java   |   6 ++++++
 .../src/test/resources/test-documents/testMIF.mif  | Bin 0 -> 10240 bytes
 3 files changed, 13 insertions(+), 8 deletions(-)
----------------------------------------------------------------------