You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/03/23 17:09:12 UTC
svn commit: r1668673 [6/6] - in /tika/trunk: ./ tika-app/
tika-app/src/main/java/org/apache/tika/cli/ tika-app/src/main/resources/
tika-app/src/test/java/org/apache/tika/cli/ tika-batch/ tika-batch/src/
tika-batch/src/main/ tika-batch/src/main/examples...
Added: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1668673&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Mon Mar 23 16:09:10 2015
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+ The configuration file will likely change and be backward incompatible
+ with new versions of Tika. Please stay tuned.
+ NOTE(review): not well-formed XML (root and commandline tags are self-closed); presumably deliberate, confirm. -->
+<tika-batch-config
+ maxAliveTimeSeconds="-1"
+ pauseOnEarlyTerminationMillis="500"
+ timeoutThresholdMillis="3000"
+ timeoutCheckPulseMillis="1000"
+ maxQueueSize="10000"
+ numConsumers="3"/>
+ <!-- Options that may be passed on the command line. -->
+ <commandline/>
+ <option opt="c" longOpt="tika-config" hasArg="true"
+ description="TikaConfig file"/>
+ <option opt="bc" longOpt="batch-config" hasArg="true"
+ description="xml batch config file" required="true"/>
+ <option opt="randomCrawl" hasArg="false"
+ description="file crawler crawls directories randomly"/>
+ <option opt="numConsumers" hasArg="true"
+ description="number of fileConsumers threads"/>
+ <option opt="minFileSizeBytes" hasArg="true"
+ description="minimum file size to process; do not process files smaller than this"/>
+ <option opt="maxFileSizeBytes" hasArg="true"
+ description="maximum file size to process; do not process files larger than this"/>
+ <option opt="maxQueueSize" hasArg="true"
+ description="maximum queue size for FileResources"/>
+ <option opt="fileList" hasArg="true"
+ description="file that contains a list of files (relative to inputDir) to process"/>
+ <option opt="fileListEncoding" hasArg="true"
+ description="encoding for fileList"/>
+ <option opt="inputDir" hasArg="true"
+ description="root directory for the files to be processed"
+ required="true"/>
+ <option opt="startDir" hasArg="true"
+ description="directory (under inputDir) at which to start crawling"/>
+ <option opt="outputDir" hasArg="true"
+ description="output directory"
+ required="true"/>
+ <option opt="recursiveParserWrapper"
+ description="use the RecursiveParserWrapper or not (default = false)"/>
+ <option opt="handleExisting" hasArg="true"
+ description="if an output file already exists, do you want to: overwrite, rename or skip"/>
+ <option opt="basicHandlerType" hasArg="true"
+ description="what type of content handler: xml, text, html, body"/>
+ <option opt="outputSuffix" hasArg="true"
+ description="suffix to add to the end of the output file name"/>
+ <option opt="timeoutThresholdMillis" hasArg="true"
+ description="how long to wait before determining that a consumer should be timed out"/>
+ <option opt="pauseOnEarlyTerminationMillis" hasArg="true"
+ description="how long to wait for parsers to finish if there is an early termination from the main loop."/>
+ <!-- In a long-running process, it might be good to restart every hour or so to avoid memory leaks. -->
+ <option opt="maxAliveTimeSeconds" hasArg="true"
+ description="how long should this process run in seconds."/>
+ </commandline>
+ <!--
+ startDir can also be set: it tells the crawler to start crawling at a
+ child directory of the inputDir directory.
+ -->
+ <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+ crawlOrder="sorted"
+ maxConsecWaitMillis="5000"
+ maxFilesToAdd="-1"
+ maxFilesToConsider="-1"
+ includeFilePat=""
+ excludeFilePat=""
+ maxFileSizeBytes="-1"
+ />
+<!-- inputDir="tika-batch/src/test/resources/test-input" -->
+
+ <consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+ recursiveParserWrapper="false">
+ <parser class="org.apache.tika.parser.mock.MockParserFactory" parseRecursively="true"/>
+ <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+ basicHandlerType="xml" writeLimit="-1"/>
+
+
+ <outputstream class="FSOutputStreamFactory"
+ encoding="UTF-8" outputSuffix="xml"/>
+ </consumers>
+
+ <!-- The reporter and interrupter elements are optional. -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
+ reporterStaleThresholdMillis="500000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file
Added: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1668673&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Mon Mar 23 16:09:10 2015
@@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+ The configuration file will likely change and be backward incompatible
+ with new versions of Tika. Please stay tuned.
+ -->
+<tika-batch-config
+ maxAliveTimeSeconds="-1"
+ pauseOnEarlyTerminationMillis="500"
+ timeoutThresholdMillis="3000"
+ timeoutCheckPulseMillis="1000"
+ maxQueueSize="10000"
+ numConsumers="3">
+ <!-- Options that may be passed on the command line. -->
+ <commandline>
+ <option opt="c" longOpt="tika-config" hasArg="true"
+ description="TikaConfig file"/>
+ <option opt="bc" longOpt="batch-config" hasArg="true"
+ description="xml batch config file" required="true"/>
+ <!-- "sorted" is needed for testing; "random" was added for performance.
+ Where crawling a directory is slow, it might be beneficial to crawl in
+ random order so that the parsers are triggered earlier. The default is
+ the operating system's choice ("os"), i.e. whatever order the OS
+ returns files in from .listFiles(). The description is kept on one
+ line because line breaks inside attribute values are read as spaces. -->
+ <option opt="crawlOrder" hasArg="true"
+ description="how does the crawler sort the directories and files: (random|sorted|os)"/>
+ <option opt="numConsumers" hasArg="true"
+ description="number of fileConsumers threads"/>
+ <option opt="minFileSizeBytes" hasArg="true"
+ description="minimum file size to process; do not process files smaller than this"/>
+ <option opt="maxFileSizeBytes" hasArg="true"
+ description="maximum file size to process; do not process files larger than this"/>
+ <option opt="maxQueueSize" hasArg="true"
+ description="maximum queue size for FileResources"/>
+ <option opt="fileList" hasArg="true"
+ description="file that contains a list of files (relative to inputDir) to process"/>
+ <option opt="fileListEncoding" hasArg="true"
+ description="encoding for fileList"/>
+ <option opt="inputDir" hasArg="true"
+ description="root directory for the files to be processed"
+ required="true"/>
+ <option opt="startDir" hasArg="true"
+ description="directory (under inputDir) at which to start crawling"/>
+ <option opt="outputDir" hasArg="true"
+ description="output directory"
+ required="true"/>
+ <option opt="recursiveParserWrapper"
+ description="use the RecursiveParserWrapper or not (default = false)"/>
+ <option opt="handleExisting" hasArg="true"
+ description="if an output file already exists, do you want to: overwrite, rename or skip"/>
+ <option opt="basicHandlerType" hasArg="true"
+ description="what type of content handler: xml, text, html, body"/>
+ <option opt="outputSuffix" hasArg="true"
+ description="suffix to add to the end of the output file name"/>
+ <option opt="timeoutThresholdMillis" hasArg="true"
+ description="how long to wait before determining that a consumer should be timed out"/>
+ <option opt="pauseOnEarlyTerminationMillis" hasArg="true"
+ description="how long to wait for parsers to finish if there is an early termination from the main loop."/>
+ <!-- In a long-running process, it might be good to restart every hour or so to avoid memory leaks. -->
+ <option opt="maxAliveTimeSeconds" hasArg="true"
+ description="how long should this process run in seconds."/>
+ </commandline>
+ <!--
+ startDir can also be set: it tells the crawler to start crawling at a
+ child directory of the inputDir directory.
+ -->
+ <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+ crawlOrder="sorted"
+ maxConsecWaitMillis="5000"
+ maxFilesToAdd="-1"
+ maxFilesToConsider="-1"
+ includeFilePat=""
+ excludeFilePat=""
+ maxFileSizeBytes="-1"
+ />
+<!-- inputDir="tika-batch/src/test/resources/test-input" -->
+
+ <consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+ recursiveParserWrapper="false" consumersManagerMaxMillis="120000">
+ <parser class="org.apache.tika.parser.mock.MockParserFactory" parseRecursively="true"/>
+ <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+ basicHandlerType="xml" writeLimit="-1"/>
+
+
+ <outputstream class="FSOutputStreamFactory"
+ encoding="UTF-8" outputSuffix="xml"/>
+ </consumers>
+
+ <!-- The reporter and interrupter elements are optional. -->
+ <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
+ reporterStaleThresholdMillis="500000"/>
+ <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file