You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/31 17:53:35 UTC
[1/2] tika git commit: TIKA-1918: make outputSuffix optional in tika-batch
Repository: tika
Updated Branches:
refs/heads/master c94236a83 -> 01109c8fe
TIKA-1918: make outputSuffix optional in tika-batch
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/34db9359
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/34db9359
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/34db9359
Branch: refs/heads/master
Commit: 34db93595c71745e3bccdabc39e72181c03abbbd
Parents: 9ebf066
Author: tballison <ta...@mitre.org>
Authored: Thu Mar 31 11:52:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Mar 31 11:52:27 2016 -0400
----------------------------------------------------------------------
.../tika/cli/BatchCommandLineBuilder.java | 7 --
.../main/resources/tika-app-batch-config.xml | 10 +-
.../tika/cli/TikaCLIBatchCommandLineTest.java | 1 -
.../builders/BasicTikaFSConsumersBuilder.java | 51 ++++++++-
.../tika/batch/fs/default-tika-batch-config.xml | 50 +++++----
.../apache/tika/batch/fs/BatchProcessTest.java | 19 +++-
.../tika/batch/fs/HandlerBuilderTest.java | 4 -
.../tika-batch-config-MockConsumersBuilder.xml | 2 +-
.../test/resources/tika-batch-config-broken.xml | 2 +-
.../tika-batch-config-test-suffix-override.xml | 112 +++++++++++++++++++
.../test/resources/tika-batch-config-test.xml | 2 +-
.../tika/sax/BasicContentHandlerFactory.java | 8 ++
12 files changed, 222 insertions(+), 46 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
index da44956..2f85546 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
@@ -160,35 +160,28 @@ class BatchCommandLineBuilder {
map.remove("-h");
map.remove("--html");
map.put("-basicHandlerType", "html");
- map.put("-outputSuffix", "html");
} else if (map.containsKey("-x") || map.containsKey("--xml")) {
map.remove("-x");
map.remove("--xml");
map.put("-basicHandlerType", "xml");
- map.put("-outputSuffix", "xml");
} else if (map.containsKey("-t") || map.containsKey("--text")) {
map.remove("-t");
map.remove("--text");
map.put("-basicHandlerType", "text");
- map.put("-outputSuffix", "txt");
} else if (map.containsKey("-m") || map.containsKey("--metadata")) {
map.remove("-m");
map.remove("--metadata");
map.put("-basicHandlerType", "ignore");
- map.put("-outputSuffix", "json");
} else if (map.containsKey("-T") || map.containsKey("--text-main")) {
map.remove("-T");
map.remove("--text-main");
map.put("-basicHandlerType", "body");
- map.put("-outputSuffix", "txt");
}
if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) {
map.remove("-J");
map.remove("--jsonRecursive");
map.put("-recursiveParserWrapper", "true");
- //overwrite outputSuffix
- map.put("-outputSuffix", "json");
}
if (map.containsKey("--inputDir") || map.containsKey("-i")) {
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/resources/tika-app-batch-config.xml
----------------------------------------------------------------------
diff --git a/tika-app/src/main/resources/tika-app-batch-config.xml b/tika-app/src/main/resources/tika-app-batch-config.xml
index e2f1204..99651a1 100644
--- a/tika-app/src/main/resources/tika-app-batch-config.xml
+++ b/tika-app/src/main/resources/tika-app-batch-config.xml
@@ -124,9 +124,13 @@
digest="md5" digestMarkLimit="1000000"/>
<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
basicHandlerType="xml" writeLimit="-1"/>
- <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" -->
- <!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
- <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
+ <!-- can specify custom output file suffix with:
+ outputSuffix="mysuffix" (the suffix must not start with ".")
+ if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess -->
+ <!-- can specify compression with
+ compression="bzip2|gzip|zip" -->
+
+ <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
</consumers>
<!-- reporter and interrupter are optional -->
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
index 260273e..e543ccc 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
@@ -113,7 +113,6 @@ public class TikaCLIBatchCommandLineTest {
Map<String, String> attrs = mapify(commandLine);
assertEquals("true", attrs.get("-recursiveParserWrapper"));
assertEquals("html", attrs.get("-basicHandlerType"));
- assertEquals("json", attrs.get("-outputSuffix"));
assertEquals("batch-config.xml", attrs.get("-bc"));
assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
}
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
----------------------------------------------------------------------
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
index b65b046..4879af4 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
@@ -41,6 +41,7 @@ import org.apache.tika.batch.fs.FSOutputStreamFactory;
import org.apache.tika.batch.fs.FSUtil;
import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.PropsUtil;
@@ -125,7 +126,9 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
}
ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes);
ParserFactory parserFactory = getParserFactory(parserFactoryNode, runtimeAttributes);
- OutputStreamFactory outputStreamFactory = getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes);
+ OutputStreamFactory outputStreamFactory = getOutputStreamFactory(
+ outputStreamFactoryNode, runtimeAttributes,
+ contentHandlerFactory, recursiveParserWrapper);
if (recursiveParserWrapper) {
for (int i = 0; i < numConsumers; i++) {
@@ -147,7 +150,6 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
return manager;
}
-
private ContentHandlerFactory getContentHandlerFactory(Node node, Map<String, String> runtimeAttributes) {
Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
@@ -166,7 +168,10 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
return builder.build(node, runtimeAttributes);
}
- private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String> runtimeAttributes) {
+ private OutputStreamFactory getOutputStreamFactory(Node node,
+ Map<String, String> runtimeAttributes,
+ ContentHandlerFactory contentHandlerFactory,
+ boolean useRecursiveParserWrapper) {
Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
Path outputDir = PropsUtil.getPath(attrs.get("outputDir"), null);
@@ -196,6 +201,17 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
compression = FSOutputStreamFactory.COMPRESSION.ZIP;
}
String suffix = attrs.get("outputSuffix");
+ //suffix should not start with "."
+ if (suffix == null) {
+ StringBuilder sb = new StringBuilder();
+ if (useRecursiveParserWrapper) {
+ sb.append("json");
+ } else if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
+ appendSuffix(((BasicContentHandlerFactory) contentHandlerFactory).getType(), sb);
+ }
+ appendCompression(compression, sb);
+ suffix = sb.toString();
+ }
//TODO: possibly open up the different handle-existings in the future
//but for now, lock it down to require skip. Too dangerous otherwise
@@ -204,4 +220,33 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
compression, suffix);
}
+ private void appendCompression(FSOutputStreamFactory.COMPRESSION compression, StringBuilder sb) {
+ switch (compression) {
+ case NONE:
+ break;
+ case ZIP:
+ sb.append(".zip");
+ break;
+ case BZIP2:
+ sb.append(".bz2");
+ break;
+ case GZIP:
+ sb.append(".gz");
+ break;
+ }
+ }
+
+ private void appendSuffix(BasicContentHandlerFactory.HANDLER_TYPE type, StringBuilder sb) {
+ switch (type) {
+ case XML:
+ sb.append("xml");
+ break;
+ case HTML:
+ sb.append("html");
+ break;
+ default :
+ sb.append("txt");
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
index 394c458..1b71152 100644
--- a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
+++ b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
@@ -26,13 +26,13 @@
<tika-batch-config
maxAliveTimeSeconds="-1"
pauseOnEarlyTerminationMillis="10000"
- timeoutThresholdMillis="300000"
- timeoutCheckPulseMillis="1000"
- maxQueueSize="10000"
- numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
-
- <!-- options to allow on the commandline -->
- <commandline>
+ timeoutThresholdMillis="300000"
+ timeoutCheckPulseMillis="1000"
+ maxQueueSize="10000"
+ numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
+
+ <!-- options to allow on the commandline -->
+ <commandline>
<option opt="c" longOpt="tika-config" hasArg="true"
description="TikaConfig file"/>
<option opt="bc" longOpt="batch-config" hasArg="true"
@@ -72,14 +72,14 @@
<option opt="timeoutThresholdMillis" hasArg="true"
description="how long to wait before determining that a consumer is stale"/>
<option opt="includeFilePat" hasArg="true"
- description="regex that specifies which files to process"/>
- <option opt="excludeFilePat" hasArg="true"
- description="regex that specifies which files to avoid processing"/>
- <option opt="reporterSleepMillis" hasArg="true"
- description="millisecond between reports by the reporter"/>
- </commandline>
-
-
+ description="regex that specifies which files to process"/>
+ <option opt="excludeFilePat" hasArg="true"
+ description="regex that specifies which files to avoid processing"/>
+ <option opt="reporterSleepMillis" hasArg="true"
+ description="millisecond between reports by the reporter"/>
+ </commandline>
+
+
<!-- can specify inputDir="input", but the default config should not include this -->
<!-- can also specify startDir="input/someDir" to specify which child directory
to start processing -->
@@ -116,12 +116,16 @@
parseRecursively="true"/>
<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
basicHandlerType="xml" writeLimit="-1"/>
- <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" --> <!-- can include e.g. outputDir="output", but we don't want to include this in the default! -->
- <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/>
- </consumers>
-
- <!-- reporter and interrupter are optional -->
- <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
- reporterStaleThresholdMillis="60000"/>
- <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+ <!-- can specify custom output file suffix with:
+ outputSuffix="mysuffix" (the suffix must not start with ".")
+ if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess -->
+ <!-- can specify compression with
+ compression="bzip2|gzip|zip" -->
+ <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
+ </consumers>
+
+ <!-- reporter and interrupter are optional -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+ reporterStaleThresholdMillis="60000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
</tika-batch-config>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
index 8cea0b3..d623afb 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
@@ -276,7 +276,6 @@ public class BatchProcessTest extends FSBatchTestBase {
Paths.get(this.getClass().getResource("/testFileList.txt").toURI()).toString());
args.put("recursiveParserWrapper", "true");
args.put("basicHandlerType", "text");
- args.put("outputSuffix", "json");
BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args, "/tika-batch-config-MockConsumersBuilder.xml");
ex.execute();
Path test1 = outputDir.resolve("test1.xml.json");
@@ -302,7 +301,6 @@ public class BatchProcessTest extends FSBatchTestBase {
args.put("numConsumers", "1");
args.put("recursiveParserWrapper", "true");
args.put("basicHandlerType", "text");
- args.put("outputSuffix", "json");
BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
"/tika-batch-config-MockConsumersBuilder.xml",
@@ -312,6 +310,23 @@ public class BatchProcessTest extends FSBatchTestBase {
assertContains("parse_ex resourceId=\"test0_bad_chars.xml\"", ss.getOutString());
}
+ @Test
+ public void testOverrideOutputSuffix() throws Exception {
+ Path outputDir = getNewOutputDir("outputSuffixTest");
+
+ Map<String, String> args = getDefaultArgs("basic", outputDir);
+ args.put("numConsumers", "1");
+ args.put("recursiveParserWrapper", "true");
+ args.put("basicHandlerType", "text");
+
+ BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
+ "/tika-batch-config-test-suffix-override.xml",
+ "/log4j-on.properties");
+ ex.execute();
+ Path targ = outputDir.resolve("test0.xml.mysuffix");
+ assertTrue(Files.isRegularFile(targ));
+ }
+
private class BatchProcessTestExecutor {
private final Map<String, String> args;
private final String configPath;
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
index d8aecad..6e3648a 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
@@ -36,7 +36,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
Path outputDir = getNewOutputDir("handler-xml-");
Map<String, String> args = getDefaultArgs("basic", outputDir);
args.put("basicHandlerType", "xml");
- args.put("outputSuffix", "xml");
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
ParallelFileProcessingResult result = run(runner);
@@ -54,7 +53,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
Map<String, String> args = getDefaultArgs("basic", outputDir);
args.put("basicHandlerType", "html");
- args.put("outputSuffix", "html");
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
ParallelFileProcessingResult result = run(runner);
Path outputFile = outputDir.resolve("test0.xml.html");
@@ -70,7 +68,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
Map<String, String> args = getDefaultArgs("basic", outputDir);
args.put("basicHandlerType", "txt");
- args.put("outputSuffix", "txt");
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
ParallelFileProcessingResult result = run(runner);
@@ -105,7 +102,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
Map<String, String> args = getDefaultArgs("basic", outputDir);
args.put("basicHandlerType", "txt");
- args.put("outputSuffix", "json");
args.put("recursiveParserWrapper", "true");
BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args);
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
index a2915cf..8da44be 100644
--- a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
@@ -103,7 +103,7 @@
basicHandlerType="xml" writeLimit="-1"/>
<outputstream class="FSOutputStreamFactory"
- encoding="UTF-8" outputSuffix="xml"/>
+ encoding="UTF-8"/>
</consumers>
<!-- reporter and interrupter are optional -->
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-broken.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-broken.xml b/tika-batch/src/test/resources/tika-batch-config-broken.xml
index 1d599b4..5b8490e 100644
--- a/tika-batch/src/test/resources/tika-batch-config-broken.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-broken.xml
@@ -97,7 +97,7 @@
basicHandlerType="xml" writeLimit="-1"/>
<outputstream class="FSOutputStreamFactory"
- encoding="UTF-8" outputSuffix="xml"/>
+ encoding="UTF-8"/>
</consumers>
<!-- reporter and interrupter are optional -->
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
new file mode 100644
index 0000000..911398f
--- /dev/null
+++ b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+ The configuration file will likely change and be backward incompatible
+ with new versions of Tika. Please stay tuned.
+ -->
+<tika-batch-config
+ maxAliveTimeSeconds="-1"
+ pauseOnEarlyTerminationMillis="500"
+ timeoutThresholdMillis="3000"
+ timeoutCheckPulseMillis="1000"
+ maxQueueSize="10000"
+ numConsumers="3">
+ <!-- options to allow on the commandline -->
+ <commandline>
+ <option opt="c" longOpt="tika-config" hasArg="true"
+ description="TikaConfig file"/>
+ <option opt="bc" longOpt="batch-config" hasArg="true"
+ description="xml batch config file" required="true"/>
+ <!-- We needed sorted for testing. We added random for performance.
+ Where crawling a directory is slow, it might be beneficial to
+ go randomly so that the parsers are triggered earlier. The
+ default is operating system's choice ("os") which means whatever order
+ the os returns files in .listFiles(). -->
+ <option opt="crawlOrder" hasArg="true"
+ description="how does the crawler sort the directories and files:
+ (random|sorted|os)"/>
+ <option opt="numConsumers" hasArg="true"
+ description="number of fileConsumers threads"/>
+ <option opt="minFileSizeBytes" hasArg="true"
+ description="minimum file size to process; do not process files smaller than this"/>
+ <option opt="maxFileSizeBytes" hasArg="true"
+ description="maximum file size to process; do not process files larger than this"/>
+ <option opt="maxQueueSize" hasArg="true"
+ description="maximum queue size for FileResources"/>
+ <option opt="fileList" hasArg="true"
+ description="file that contains a list of files (relative to inputDir) to process"/>
+ <option opt="fileListEncoding" hasArg="true"
+ description="encoding for fileList"/>
+ <option opt="inputDir" hasArg="true"
+ description="root directory for the files to be processed"
+ required="true"/>
+ <option opt="startDir" hasArg="true"
+ description="directory (under inputDir) at which to start crawling"/>
+ <option opt="outputDir" hasArg="true"
+ description="output directory"
+ required="true"/>
+ <option opt="recursiveParserWrapper"
+ description="use the RecursiveParserWrapper or not (default = false)"/>
+ <option opt="handleExisting" hasArg="true"
+ description="if an output file already exists, do you want to: overwrite, rename or skip"/>
+ <option opt="basicHandlerType" hasArg="true"
+ description="what type of content handler: xml, text, html, body"/>
+ <option opt="outputSuffix" hasArg="true"
+ description="suffix to add to the end of the output file name"/>
+ <option opt="timeoutThresholdMillis" hasArg="true"
+ description="how long to wait before determining that a consumer should be timed out"/>
+ <option opt="pauseOnEarlyTerminationMillis" hasArg="true"
+ description="how long to wait for parsers to finish if there is an early termination from the main loop."/>
+ <!-- in long running process, might be good to restart every hour or so to avoid memory leaks-->
+ <option opt="maxAliveTimeSeconds" hasArg="true"
+ description="how long should this process run in seconds."/>
+ </commandline>
+ <!--
+ Can also add startDir: this tells the crawler to start indexing a
+ child directory of the inputDir directory.
+ -->
+ <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+ crawlOrder="sorted"
+ maxConsecWaitMillis="5000"
+ maxFilesToAdd="-1"
+ maxFilesToConsider="-1"
+ includeFilePat=""
+ excludeFilePat=""
+ maxFileSizeBytes="-1"
+ />
+<!-- inputDir="tika-batch/src/test/resources/test-input" -->
+
+ <consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+ recursiveParserWrapper="false" consumersManagerMaxMillis="120000">
+ <parser builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+ class="org.apache.tika.parser.mock.MockParserFactory"
+ parseRecursively="true"/>
+ <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+ basicHandlerType="xml" writeLimit="-1"/>
+
+ <outputstream class="FSOutputStreamFactory"
+ encoding="UTF-8" outputSuffix="mysuffix"/>
+ </consumers>
+
+ <!-- reporter and interrupter are optional -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
+ reporterStaleThresholdMillis="500000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-test.xml b/tika-batch/src/test/resources/tika-batch-config-test.xml
index cf71fd6..755eb58 100644
--- a/tika-batch/src/test/resources/tika-batch-config-test.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-test.xml
@@ -102,7 +102,7 @@
basicHandlerType="xml" writeLimit="-1"/>
<outputstream class="FSOutputStreamFactory"
- encoding="UTF-8" outputSuffix="xml"/>
+ encoding="UTF-8"/>
</consumers>
<!-- reporter and interrupter are optional -->
http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 810b72e..c611f09 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -153,4 +153,12 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory {
}
}
+ /**
+ * Returns the handler type used by this factory.
+ * @return handler type used by this factory
+ */
+ public HANDLER_TYPE getType() {
+ return type;
+ }
+
}
[2/2] tika git commit: Merge remote-tracking branch 'origin/master'
Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master'
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/01109c8f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/01109c8f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/01109c8f
Branch: refs/heads/master
Commit: 01109c8fec3a736749b9cd2dd741fbcc936cbf4c
Parents: 34db935 c94236a
Author: tballison <ta...@mitre.org>
Authored: Thu Mar 31 11:53:28 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Mar 31 11:53:28 2016 -0400
----------------------------------------------------------------------
.../org/apache/tika/mime/tika-mimetypes.xml | 15 +++++++--------
.../java/org/apache/tika/mime/TestMimeTypes.java | 6 ++++++
.../src/test/resources/test-documents/testMIF.mif | Bin 0 -> 10240 bytes
3 files changed, 13 insertions(+), 8 deletions(-)
----------------------------------------------------------------------