You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/08/06 14:54:16 UTC

svn commit: r1369824 - in /opennlp/sandbox/corpus-server-connector/desc: EngTokenizerAndSentdetectCPE.xml SentenceDetector.xml Tokenizer.xml TokenizerAndSentdetectAAE.xml

Author: joern
Date: Mon Aug  6 12:54:15 2012
New Revision: 1369824

URL: http://svn.apache.org/viewvc?rev=1369824&view=rev
Log:
OPENNLP-261 Added sample to perform sentence detection and tokenization via a CPE.

Added:
    opennlp/sandbox/corpus-server-connector/desc/EngTokenizerAndSentdetectCPE.xml   (with props)
    opennlp/sandbox/corpus-server-connector/desc/SentenceDetector.xml   (with props)
    opennlp/sandbox/corpus-server-connector/desc/Tokenizer.xml   (with props)
    opennlp/sandbox/corpus-server-connector/desc/TokenizerAndSentdetectAAE.xml   (with props)

Added: opennlp/sandbox/corpus-server-connector/desc/EngTokenizerAndSentdetectCPE.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/EngTokenizerAndSentdetectCPE.xml?rev=1369824&view=auto
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/EngTokenizerAndSentdetectCPE.xml (added)
+++ opennlp/sandbox/corpus-server-connector/desc/EngTokenizerAndSentdetectCPE.xml Mon Aug  6 12:54:15 2012
@@ -0,0 +1,132 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<cpeDescription>
+	<collectionReader> <!-- Source of the CASes: the reader imported from CSQueueCollectionReader.xml, configured by the name/value pairs below. -->
+		<collectionIterator>
+			<descriptor>
+				<import location="CSQueueCollectionReader.xml" />
+			</descriptor>
+			<configurationParameterSettings> <!-- NOTE(review): the parameter meanings are defined by the imported reader descriptor; the notes below are inferred from the names and should be confirmed there. -->
+				<nameValuePair> <!-- presumably the base URL of the corpus server's REST interface; this sample points at a local instance -->
+					<name>ServerAddress</name>
+					<value>
+						<string>http://localhost:8080/rest</string>
+					</value>
+				</nameValuePair>
+				<nameValuePair> <!-- same corpus name as configured for the CSWriter cas processor in this file -->
+					<name>CorpusName</name>
+					<value>
+						<string>enwikinews</string>
+					</value>
+				</nameValuePair>
+				<nameValuePair> <!-- "*:*" looks like a match-all query; confirm the query syntax with the server -->
+					<name>SearchQuery</name>
+					<value>
+						<string>*:*</string>
+					</value>
+				</nameValuePair>
+				<nameValuePair>
+					<name>QueueName</name>
+					<value>
+						<string>enwikinewsnlpqueue</string>
+					</value>
+				</nameValuePair>
+				<nameValuePair> <!-- IdFSTypeName and IdFeatureName carry the same values here as in the CSWriter settings of this file -->
+					<name>IdFSTypeName</name>
+					<value>
+						<string>org.apache.opennlp.annotations.ArticleId</string>
+					</value>
+				</nameValuePair>
+				<nameValuePair>
+					<name>IdFeatureName</name>
+					<value>
+						<string>id</string>
+					</value>
+				</nameValuePair>
+			</configurationParameterSettings>
+		</collectionIterator>
+	</collectionReader>
+	<casProcessors casPoolSize="2" processingUnitThreadCount="1"> <!-- Two integrated processors, declared in this order: the tokenizer/sentence detector aggregate, then the CSWriter. -->
+		<casProcessor deployment="integrated" name="TokenizerAndSentdetectAAE"> <!-- analysis step: the aggregate engine described in TokenizerAndSentdetectAAE.xml; no parameter overrides are given -->
+			<descriptor>
+				<import location="TokenizerAndSentdetectAAE.xml" />
+			</descriptor>
+			<deploymentParameters />
+			<filter></filter> <!-- empty: no filter expression is given -->
+			<errorHandling> <!-- every listed action is "terminate"; see the UIMA CPE descriptor reference for the exact semantics of each threshold -->
+				<errorRateThreshold action="terminate" value="100/1000" />
+				<maxConsecutiveRestarts action="terminate"
+					value="30" />
+				<timeout max="100000" /> <!-- presumably milliseconds; confirm against the UIMA CPE descriptor reference -->
+			</errorHandling>
+			<checkpoint batch="10000" />
+		</casProcessor>
+		<casProcessor deployment="integrated" name="CSWriter"> <!-- output step: the component imported from CSCasWriter.xml, configured by the name/value pairs below -->
+			<descriptor>
+				<import location="CSCasWriter.xml" />
+			</descriptor>
+			<configurationParameterSettings> <!-- ServerAddress, CorpusName, IdFSTypeName and IdFeatureName repeat the values given to the collection reader in this file -->
+				<nameValuePair>
+					<name>ServerAddress</name>
+					<value>
+						<string>http://localhost:8080/rest</string>
+					</value>
+				</nameValuePair>
+				<nameValuePair>
+					<name>CorpusName</name>
+					<value>
+						<string>enwikinews</string>
+					</value>
+				</nameValuePair>
+
+				<nameValuePair>
+					<name>IdFSTypeName</name>
+					<value>
+						<string>org.apache.opennlp.annotations.ArticleId</string>
+					</value>
+				</nameValuePair>
+
+				<nameValuePair>
+					<name>IdFeatureName</name>
+					<value>
+						<string>id</string>
+					</value>
+				</nameValuePair>
+			</configurationParameterSettings>
+			<deploymentParameters />
+			<filter></filter>
+			<errorHandling> <!-- identical to the error handling of the TokenizerAndSentdetectAAE processor above -->
+				<errorRateThreshold action="terminate" value="100/1000" />
+				<maxConsecutiveRestarts action="terminate"
+					value="30" />
+				<timeout max="100000" />
+			</errorHandling>
+			<checkpoint batch="10000" />
+		</casProcessor>
+	</casProcessors>
+	<cpeConfig> <!-- Global engine settings for this CPE run. -->
+		<numToProcess>-1</numToProcess> <!-- presumably "no limit, process everything the reader delivers"; confirm against the UIMA CPE descriptor reference -->
+		<deployAs>immediate</deployAs>
+		<checkpoint file="" time="300000" /> <!-- no checkpoint file name is given -->
+		<timerImpl></timerImpl> <!-- empty: no timer implementation class is named -->
+	</cpeConfig>
+</cpeDescription>

Propchange: opennlp/sandbox/corpus-server-connector/desc/EngTokenizerAndSentdetectCPE.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/sandbox/corpus-server-connector/desc/SentenceDetector.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/SentenceDetector.xml?rev=1369824&view=auto
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/SentenceDetector.xml (added)
+++ opennlp/sandbox/corpus-server-connector/desc/SentenceDetector.xml Mon Aug  6 12:54:15 2012
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+	<!-- Framework name kept on a single line so that no newline or tab becomes part of its text value. -->
+	<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+	<primitive>true</primitive> <!-- primitive engine: implemented by the annotator class named on the next line -->
+	<annotatorImplementationName>opennlp.uima.sentdetect.SentenceDetector</annotatorImplementationName>
+	<analysisEngineMetaData> <!-- Metadata of the sentence detector: declared parameters, their values, the type system import and capabilities. -->
+		<name>Sentence Detector</name>
+		<description></description>
+		<version>${pom.version}</version> <!-- placeholder; presumably replaced by the Maven build, confirm that resource filtering covers this directory -->
+		<vendor>Apache Software Foundation</vendor>
+		<configurationParameters>
+			<configurationParameter> <!-- mandatory; a value is supplied in configurationParameterSettings below -->
+				<name>opennlp.uima.SentenceType</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>true</mandatory>
+			</configurationParameter>
+			<configurationParameter> <!-- optional; this descriptor sets no value for it -->
+				<name>opennlp.uima.ContainerType</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>false</mandatory>
+			</configurationParameter>
+		</configurationParameters>
+		
+		<configurationParameterSettings>
+			<nameValuePair> <!-- same type name that Tokenizer.xml uses for its opennlp.uima.SentenceType parameter -->
+				<name>opennlp.uima.SentenceType</name>
+				<value>
+					<string>org.apache.opennlp.annotations.Sentence</string>
+				</value>
+			</nameValuePair>
+		</configurationParameterSettings>
+		
+		<typeSystemDescription>
+			<imports>
+				<import location="TypeSystem.xml" />
+			</imports>
+		</typeSystemDescription>
+		
+		<capabilities>
+			<capability> <!-- no input or output types are declared; only the language is -->
+				<inputs />
+				<outputs />
+				<languagesSupported>
+					<language>en</language>
+				</languagesSupported>
+			</capability>
+		</capabilities>
+	</analysisEngineMetaData> <!-- NOTE(review): unlike Tokenizer.xml, no operationalProperties section is given here; confirm that the framework defaults are intended -->
+
+	<externalResourceDependencies> <!-- Declares the model resource this engine needs; TokenizerAndSentdetectAAE.xml binds this key to its SentenceModel resource. -->
+		<externalResourceDependency>
+			<key>opennlp.uima.ModelName</key>
+			<interfaceName>opennlp.uima.sentdetect.SentenceModelResource</interfaceName>
+		</externalResourceDependency>
+	</externalResourceDependencies>
+</analysisEngineDescription>

Propchange: opennlp/sandbox/corpus-server-connector/desc/SentenceDetector.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/sandbox/corpus-server-connector/desc/Tokenizer.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/Tokenizer.xml?rev=1369824&view=auto
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/Tokenizer.xml (added)
+++ opennlp/sandbox/corpus-server-connector/desc/Tokenizer.xml Mon Aug  6 12:54:15 2012
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+	<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+	<primitive>true</primitive> <!-- primitive engine: implemented by the annotator class named on the next line -->
+	<annotatorImplementationName>opennlp.uima.tokenize.Tokenizer</annotatorImplementationName>
+	<analysisEngineMetaData> <!-- Metadata of the tokenizer: declared parameters, their values, the type system import, capabilities and operational properties. -->
+		<name>Tokenizer</name>
+		<description></description>
+		<version>${pom.version}</version> <!-- placeholder; presumably replaced by the Maven build, confirm that resource filtering covers this directory -->
+		<vendor>Apache Software Foundation</vendor>
+		<configurationParameters>
+			<configurationParameter> <!-- mandatory; a value is supplied in configurationParameterSettings below -->
+				<name>opennlp.uima.SentenceType</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>true</mandatory>
+			</configurationParameter>
+			
+			<configurationParameter> <!-- mandatory; a value is supplied in configurationParameterSettings below -->
+				<name>opennlp.uima.TokenType</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>true</mandatory>
+			</configurationParameter>
+			
+			<configurationParameter> <!-- optional; this descriptor sets no value for it -->
+				<name>opennlp.uima.tokenizer.IsAlphaNumericOptimization</name>
+				<type>String</type> <!-- NOTE(review): the name suggests an on/off switch, yet the parameter is declared as String; confirm the type the annotator actually reads -->
+				<multiValued>false</multiValued>
+				<mandatory>false</mandatory>
+			</configurationParameter>
+		</configurationParameters>
+		<configurationParameterSettings>
+			<nameValuePair>
+				<name>opennlp.uima.TokenType</name>
+				<value>
+					<string>org.apache.opennlp.annotations.Token</string>
+				</value>
+			</nameValuePair>
+			<nameValuePair> <!-- same type name that SentenceDetector.xml uses for its opennlp.uima.SentenceType parameter -->
+				<name>opennlp.uima.SentenceType</name>
+				<value>
+					<string>org.apache.opennlp.annotations.Sentence</string>
+				</value>
+			</nameValuePair>
+		</configurationParameterSettings>
+		
+		<typeSystemDescription>
+			<imports>
+				<import location="TypeSystem.xml" />
+			</imports>
+		</typeSystemDescription>
+		
+		<capabilities>
+			<capability> <!-- no input or output types are declared; only the language is -->
+				<inputs />
+				<outputs />
+				<languagesSupported>
+					<language>en</language>
+				</languagesSupported>
+			</capability>
+		</capabilities>
+		<operationalProperties> <!-- multiple deployment is allowed here, while the aggregate TokenizerAndSentdetectAAE.xml sets it to false -->
+			<modifiesCas>true</modifiesCas>
+			<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+		</operationalProperties>
+	</analysisEngineMetaData>
+	
+	<externalResourceDependencies> <!-- Declares the model resource this engine needs; TokenizerAndSentdetectAAE.xml binds this key to its TokenModel resource. -->
+		<externalResourceDependency>
+			<key>opennlp.uima.ModelName</key>
+			<interfaceName>opennlp.uima.tokenize.TokenizerModelResource</interfaceName>
+		</externalResourceDependency>
+	</externalResourceDependencies>
+</analysisEngineDescription>

Propchange: opennlp/sandbox/corpus-server-connector/desc/Tokenizer.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/sandbox/corpus-server-connector/desc/TokenizerAndSentdetectAAE.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/corpus-server-connector/desc/TokenizerAndSentdetectAAE.xml?rev=1369824&view=auto
==============================================================================
--- opennlp/sandbox/corpus-server-connector/desc/TokenizerAndSentdetectAAE.xml (added)
+++ opennlp/sandbox/corpus-server-connector/desc/TokenizerAndSentdetectAAE.xml Mon Aug  6 12:54:15 2012
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+	<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+	<primitive>false</primitive> <!-- aggregate engine: the work is done by the delegates declared below -->
+
+	<delegateAnalysisEngineSpecifiers> <!-- The two delegates; their keys are reused by the fixedFlow nodes and as prefixes of the resource binding keys in this file. -->
+		<delegateAnalysisEngine key="SentenceDetector">
+			<import location="SentenceDetector.xml" />
+		</delegateAnalysisEngine>
+
+		<delegateAnalysisEngine key="Tokenizer">
+			<import location="Tokenizer.xml" />
+		</delegateAnalysisEngine>
+	</delegateAnalysisEngineSpecifiers>
+
+	<analysisEngineMetaData> <!-- Metadata of the aggregate: no parameters of its own, a fixed two-step flow, capabilities and operational properties. -->
+		<name>TokenizerAndSentdetectAAE</name>
+		<description />
+		<version>${pom.version}</version> <!-- placeholder; presumably replaced by the Maven build, confirm that resource filtering covers this directory -->
+		<vendor>Apache Software Foundation</vendor>
+		<configurationParameters />
+		<configurationParameterSettings />
+		<flowConstraints>
+			<fixedFlow> <!-- SentenceDetector is listed before Tokenizer; Tokenizer.xml requires a sentence type, so it presumably relies on the sentences created by the first node (confirm) -->
+				<node>SentenceDetector</node>
+				<node>Tokenizer</node>
+			</fixedFlow>
+		</flowConstraints>
+		<capabilities>
+			<capability> <!-- no input or output types are declared; only the language is -->
+				<inputs />
+				<outputs />
+				<languagesSupported>
+					<language>en</language>
+				</languagesSupported>
+			</capability>
+		</capabilities>
+		<operationalProperties> <!-- multiple deployment is disabled for the aggregate, although Tokenizer.xml allows it for itself -->
+			<modifiesCas>true</modifiesCas>
+			<multipleDeploymentAllowed>false</multipleDeploymentAllowed>
+			<outputsNewCASes>false</outputsNewCASes>
+		</operationalProperties>
+	</analysisEngineMetaData>
+
+	<resourceManagerConfiguration> <!-- Defines the two model resources and binds each one to the opennlp.uima.ModelName dependency of a delegate. -->
+
+		<externalResources>
+			<externalResource>
+				<name>SentenceModel</name>
+				<fileResourceSpecifier>
+					<fileUrl>file:en-sent.bin</fileUrl> <!-- relative file URL; NOTE(review): where it resolves from at runtime (working directory or data path) should be confirmed -->
+				</fileResourceSpecifier>
+				<implementationName>opennlp.uima.sentdetect.SentenceModelResourceImpl</implementationName>
+			</externalResource>
+
+			<externalResource>
+				<name>TokenModel</name>
+				<fileResourceSpecifier>
+					<fileUrl>file:en-token.bin</fileUrl> <!-- relative file URL, same caveat as for en-sent.bin above -->
+				</fileResourceSpecifier>
+				<implementationName>opennlp.uima.tokenize.TokenizerModelResourceImpl</implementationName>
+			</externalResource>
+		</externalResources>
+
+		<externalResourceBindings> <!-- each key is the delegate key, a slash, then the dependency key declared in that delegate's descriptor -->
+			<externalResourceBinding>
+				<key>SentenceDetector/opennlp.uima.ModelName</key>
+				<resourceName>SentenceModel</resourceName>
+			</externalResourceBinding>
+			<externalResourceBinding>
+				<key>Tokenizer/opennlp.uima.ModelName</key>
+				<resourceName>TokenModel</resourceName>
+			</externalResourceBinding>
+		</externalResourceBindings>
+
+	</resourceManagerConfiguration>
+</analysisEngineDescription>
\ No newline at end of file

Propchange: opennlp/sandbox/corpus-server-connector/desc/TokenizerAndSentdetectAAE.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain