You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/03/23 17:09:12 UTC

svn commit: r1668673 [6/6] - in /tika/trunk: ./ tika-app/ tika-app/src/main/java/org/apache/tika/cli/ tika-app/src/main/resources/ tika-app/src/test/java/org/apache/tika/cli/ tika-batch/ tika-batch/src/ tika-batch/src/main/ tika-batch/src/main/examples...

Added: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1668673&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Mon Mar 23 16:09:10 2015
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+    The configuration file will likely change and be backward incompatible
+    with new versions of Tika.  Please stay tuned.
+    -->
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="500"
+        timeoutThresholdMillis="3000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="3"/>
+    <!-- options to allow on the commandline -->
+    <commandline/>
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <option opt="randomCrawl" hasArg="false"
+                description="file crawler crawls directories randomly"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="minFileSizeBytes" hasArg="true"
+                description="minimum file size to process; do not process files smaller than this"/>
+        <option opt="maxFileSizeBytes" hasArg="true"
+                description="maximum file size to process; do not process files larger than this"/>
+        <option opt="maxQueueSize" hasArg="true"
+                description="maximum queue size for FileResources"/>
+        <option opt="fileList" hasArg="true"
+                description="file that contains a list of files (relative to inputDir) to process"/>
+        <option opt="fileListEncoding" hasArg="true"
+                description="encoding for fileList"/>
+        <option opt="inputDir" hasArg="true"
+                description="root directory for the files to be processed"
+                required="true"/>
+        <option opt="startDir" hasArg="true"
+                description="directory (under inputDir) at which to start crawling"/>
+        <option opt="outputDir" hasArg="true"
+                description="output directory"
+                required="true"/>
+        <option opt="recursiveParserWrapper"
+                description="use the RecursiveParserWrapper or not (default = false)"/>
+        <option opt="handleExisting" hasArg="true"
+                description="if an output file already exists, do you want to: overwrite, rename or skip"/>
+        <option opt="basicHandlerType" hasArg="true"
+                description="what type of content handler: xml, text, html, body"/>
+        <option opt="outputSuffix" hasArg="true"
+                description="suffix to add to the end of the output file name"/>
+        <option opt="timeoutThresholdMillis" hasArg="true"
+                description="how long to wait before determining that a consumer should be timed out"/>
+        <option opt="pauseOnEarlyTerminationMillis" hasArg="true"
+                description="how long to wait for parsers to finish if there is an early termination from the main loop."/>
+        <!-- in a long-running process, it might be good to restart every hour or so to avoid memory leaks -->
+        <option opt="maxAliveTimeSeconds" hasArg="true"
+                description="how long should this process run in seconds."/>
+    </commandline>
+    <!--
+        Can also add startDir: this tells the crawler to start crawling at a
+        child directory of the inputDir directory.
+    -->
+	<crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+        crawlOrder="sorted"
+        maxConsecWaitMillis="5000"
+        maxFilesToAdd="-1"
+		maxFilesToConsider="-1" 
+		includeFilePat=""
+		excludeFilePat=""
+		maxFileSizeBytes="-1"
+        />
+<!--        inputDir="tika-batch/src/test/resources/test-input" -->
+
+	<consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+               recursiveParserWrapper="false">
+		<parser class="org.apache.tika.parser.mock.MockParserFactory" parseRecursively="true"/>
+		<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+                        basicHandlerType="xml" writeLimit="-1"/>
+
+
+		<outputstream class="FSOutputStreamFactory"
+                encoding="UTF-8" outputSuffix="xml"/>
+	</consumers>
+	
+	<!-- reporter and interrupter are optional -->
+	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
+              reporterStaleThresholdMillis="500000"/>
+	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file

Added: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1668673&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Mon Mar 23 16:09:10 2015
@@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+    The configuration file will likely change and be backward incompatible
+    with new versions of Tika.  Please stay tuned.
+    -->
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="500"
+        timeoutThresholdMillis="3000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="3">
+    <!-- options to allow on the commandline -->
+    <commandline>
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <!-- "sorted" is needed for testing; "random" was added for performance.
+             Where crawling a directory is slow, it might be beneficial to
+             crawl in random order so that the parsers are triggered earlier.  The
+             default is the operating system's choice ("os"), which means whatever
+             order the OS returns files in from .listFiles(). -->
+        <option opt="crawlOrder" hasArg="true"
+                description="how does the crawler sort the directories and files:
+                                (random|sorted|os)"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="minFileSizeBytes" hasArg="true"
+                description="minimum file size to process; do not process files smaller than this"/>
+        <option opt="maxFileSizeBytes" hasArg="true"
+                description="maximum file size to process; do not process files larger than this"/>
+        <option opt="maxQueueSize" hasArg="true"
+                description="maximum queue size for FileResources"/>
+        <option opt="fileList" hasArg="true"
+                description="file that contains a list of files (relative to inputDir) to process"/>
+        <option opt="fileListEncoding" hasArg="true"
+                description="encoding for fileList"/>
+        <option opt="inputDir" hasArg="true"
+                description="root directory for the files to be processed"
+                required="true"/>
+        <option opt="startDir" hasArg="true"
+                description="directory (under inputDir) at which to start crawling"/>
+        <option opt="outputDir" hasArg="true"
+                description="output directory"
+                required="true"/>
+        <option opt="recursiveParserWrapper"
+                description="use the RecursiveParserWrapper or not (default = false)"/>
+        <option opt="handleExisting" hasArg="true"
+                description="if an output file already exists, do you want to: overwrite, rename or skip"/>
+        <option opt="basicHandlerType" hasArg="true"
+                description="what type of content handler: xml, text, html, body"/>
+        <option opt="outputSuffix" hasArg="true"
+                description="suffix to add to the end of the output file name"/>
+        <option opt="timeoutThresholdMillis" hasArg="true"
+                description="how long to wait before determining that a consumer should be timed out"/>
+        <option opt="pauseOnEarlyTerminationMillis" hasArg="true"
+                description="how long to wait for parsers to finish if there is an early termination from the main loop."/>
+        <!-- in a long-running process, it might be good to restart every hour or so to avoid memory leaks -->
+        <option opt="maxAliveTimeSeconds" hasArg="true"
+                description="how long should this process run in seconds."/>
+    </commandline>
+    <!--
+        Can also add startDir: this tells the crawler to start crawling at a
+        child directory of the inputDir directory.
+    -->
+	<crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+        crawlOrder="sorted"
+        maxConsecWaitMillis="5000"
+        maxFilesToAdd="-1"
+		maxFilesToConsider="-1" 
+		includeFilePat=""
+		excludeFilePat=""
+		maxFileSizeBytes="-1"
+        />
+<!--        inputDir="tika-batch/src/test/resources/test-input" -->
+
+	<consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+               recursiveParserWrapper="false" consumersManagerMaxMillis="120000">
+		<parser class="org.apache.tika.parser.mock.MockParserFactory" parseRecursively="true"/>
+		<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+                        basicHandlerType="xml" writeLimit="-1"/>
+
+
+		<outputstream class="FSOutputStreamFactory"
+                encoding="UTF-8" outputSuffix="xml"/>
+	</consumers>
+	
+	<!-- reporter and interrupter are optional -->
+	<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
+              reporterStaleThresholdMillis="500000"/>
+	<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file