You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@uima.apache.org by Imran <im...@gmail.com> on 2014/12/24 11:47:54 UTC
Unable to concept map a text with apache UIMA ConceptMapper
I am trying to write a small example using the Apache UIMA ConceptMapper, but I
am unable to map the dictionary entries onto the input text. Please help me
with this. My current source code and settings are:
// Parse the ConceptMapper descriptor. Replace <PATH-TO-File> with the directory
// that actually contains ConceptMapperOffsetTokenizer.xml.
AnalysisEngineDescription conceptMapperDesc =
    UIMAFramework.getXMLParser().parseAnalysisEngineDescription(
        new XMLInputSource("<PATH-TO-File>/ConceptMapperOffsetTokenizer.xml"));
AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(conceptMapperDesc);

// Analyse one sentence that contains two variants listed in the dictionary
// ("Big Apple" and "New York City").
JCas jcas = ae.newJCas();
jcas.setDocumentText("The Big Apple is a nickname for New York City");
// NOTE(review): the descriptor loaded above is a primitive engine that lists
// uima.tt.TokenAnnotation as an input, and nothing in this snippet runs a
// tokenizer over the document first. Confirm that token annotations exist in
// the CAS before ConceptMapper runs (for example by running the tokenizer and
// ConceptMapper together in an aggregate engine).
ae.process(jcas);

// Print every DictTerm annotation the engine added to the CAS.
FSIterator<?> iter =
    jcas.getAnnotationIndex(org.apache.uima.conceptMapper.DictTerm.type).iterator();
while (iter.hasNext()) {
    Annotation term = (Annotation) iter.next();
    System.out.println(term);
}
The ConceptMapperOffsetTokenizer.xml descriptor is:
<?xml version="1.0" encoding="UTF-8"?> <!-- Licensed to the Apache
Software Foundation (ASF) under one or more contributor license
agreements. See the NOTICE file distributed with this work for
additional information regarding copyright ownership. The ASF
licenses this file to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance with the
License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an "AS IS"
BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
--> <taeDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<!-- Fully qualified annotator class; keep the value on one line with no surrounding whitespace. -->
<annotatorImplementationName>org.apache.uima.conceptMapper.ConceptMapper</annotatorImplementationName>
<analysisEngineMetaData>
<name>ConceptMapper</name> <description></description>
<version>1</version> <vendor></vendor>
<!-- Declares every parameter this annotator accepts: name, value type,
     whether it is multi-valued, and whether it is mandatory. The values
     themselves are assigned in configurationParameterSettings.
     Mandatory parameters: caseMatch, ResultingAnnotationName, AttributeList,
     FeatureList, TokenAnnotation, SpanFeatureStructure, TokenizerDescriptorPath. -->
<configurationParameters> <configurationParameter>
<name>caseMatch</name>
<description>
this parameter specifies the case folding mode:
ignoreall - fold everything to lowercase for
matching insensitive - fold only tokens with initial
caps to lowercase digitfold - fold all (and only)
tokens with a digit sensitive - perform no case
folding
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter> <configurationParameter>
<name>Stemmer</name>
<description>
Name of stemmer class to use before matching. MUST
have a zero-parameter constructor! If not specified,
no stemming will be performed.
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>ResultingAnnotationName</name>
<description>
Name of the annotation type created by this TAE,
must match the typeSystemDescription entry
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter> <configurationParameter>
<name>ResultingEnclosingSpanName</name>
<description>
Name of the feature in the resultingAnnotation to
contain the span that encloses it (i.e. its
sentence)
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>AttributeList</name>
<description>
List of attribute names for XML dictionary entry
record - must correspond to FeatureList
</description>
<type>String</type>
<multiValued>true</multiValued>
<mandatory>true</mandatory>
</configurationParameter> <configurationParameter>
<name>FeatureList</name>
<description>
List of feature names for CAS annotation - must
correspond to AttributeList
</description>
<type>String</type>
<multiValued>true</multiValued>
<mandatory>true</mandatory>
</configurationParameter> <configurationParameter>
<name>TokenAnnotation</name>
<description></description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter> <configurationParameter>
<name>TokenClassFeatureName</name>
<description>
Name of feature used when doing lookups against
IncludedTokenClasses and ExcludedTokenClasses
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>TokenTextFeatureName</name>
<description></description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>SpanFeatureStructure</name>
<description>
Type of annotation which corresponds to spans of
data for processing (e.g. a Sentence)
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter> <configurationParameter>
<name>OrderIndependentLookup</name>
<description>
True if should ignore element order during lookup
(i.e., "top box" would equal "box top"). Default is
False.
</description>
<type>Boolean</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>TokenTypeFeatureName</name>
<description>
Name of feature used when doing lookups against
IncludedTokenTypes and ExcludedTokenTypes
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>IncludedTokenTypes</name>
<description>
Type of tokens to include in lookups (if not
supplied, then all types are included except those
specifically mentioned in ExcludedTokenTypes)
</description>
<type>Integer</type>
<multiValued>true</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>ExcludedTokenTypes</name>
<description></description>
<type>Integer</type>
<multiValued>true</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>ExcludedTokenClasses</name>
<description>
Class of tokens to exclude from lookups (if not
supplied, then all classes are excluded except those
specifically mentioned in IncludedTokenClasses,
unless IncludedTokenClasses is not supplied, in
which case none are excluded)
</description>
<type>String</type>
<multiValued>true</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>IncludedTokenClasses</name>
<description>
Class of tokens to include in lookups (if not
supplied, then all classes are included except those
specifically mentioned in ExcludedTokenClasses)
</description>
<type>String</type>
<multiValued>true</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>TokenClassWriteBackFeatureNames</name>
<description>
names of features that should be written back to a
token, such as a POS tag
</description>
<type>String</type>
<multiValued>true</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>ResultingAnnotationMatchedTextFeature</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>PrintDictionary</name>
<type>Boolean</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>SearchStrategy</name>
<description>
Can be either "SkipAnyMatch",
"SkipAnyMatchAllowOverlap" or
"ContiguousMatch" ContiguousMatch: longest
match of contiguous tokens within enclosing
span(taking into account included/excluded items).
DEFAULT strategy SkipAnyMatch: longest match of
not-necessarily contiguous tokens within enclosing
span (taking into account included/excluded items).
Subsequent lookups begin in span after complete
match. IMPLIES order-independent lookup
SkipAnyMatchAllowOverlap: longest match of
not-necessarily contiguous tokens within enclosing
span (taking into account included/excluded items).
Subsequent lookups begin in span after next token.
IMPLIES order-independent lookup
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>StopWords</name>
<type>String</type>
<multiValued>true</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>FindAllMatches</name>
<type>Boolean</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>MatchedTokensFeatureName</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>ReplaceCommaWithAND</name>
<type>Boolean</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> <configurationParameter>
<name>TokenizerDescriptorPath</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter> <configurationParameter>
<name>LanguageID</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter> </configurationParameters>
<!-- Values assigned to the parameters declared in configurationParameters.
     Every value is written on one line with no surrounding whitespace so the
     element text is exactly the intended string. -->
<configurationParameterSettings>
  <nameValuePair>
    <name>caseMatch</name>
    <value><string>ignoreall</string></value>
  </nameValuePair>
  <!-- AttributeList (dictionary attribute names) and FeatureList (CAS feature
       names) are parallel lists; their descriptions require them to correspond. -->
  <nameValuePair>
    <name>AttributeList</name>
    <value>
      <array>
        <string>canonical</string>
      </array>
    </value>
  </nameValuePair>
  <nameValuePair>
    <name>FeatureList</name>
    <value>
      <array>
        <string>DictCanon</string>
      </array>
    </value>
  </nameValuePair>
  <!-- NOTE(review): the type system of this descriptor also imports
       org.apache.uima.conceptMapper.support.tokenizer.TokenAnnotation; confirm
       that uima.tt.TokenAnnotation is the type the tokenizer actually creates. -->
  <nameValuePair>
    <name>TokenAnnotation</name>
    <value><string>uima.tt.TokenAnnotation</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>ResultingAnnotationName</name>
    <value><string>org.apache.uima.conceptMapper.DictTerm</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>SpanFeatureStructure</name>
    <value><string>uima.tcas.DocumentAnnotation</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>OrderIndependentLookup</name>
    <value><boolean>false</boolean></value>
  </nameValuePair>
  <nameValuePair>
    <name>TokenClassWriteBackFeatureNames</name>
    <value><array /></value>
  </nameValuePair>
  <nameValuePair>
    <name>IncludedTokenClasses</name>
    <value><array /></value>
  </nameValuePair>
  <nameValuePair>
    <name>PrintDictionary</name>
    <value><boolean>false</boolean></value>
  </nameValuePair>
  <nameValuePair>
    <name>FindAllMatches</name>
    <value><boolean>false</boolean></value>
  </nameValuePair>
  <nameValuePair>
    <name>StopWords</name>
    <value><array /></value>
  </nameValuePair>
  <nameValuePair>
    <name>ReplaceCommaWithAND</name>
    <value><boolean>false</boolean></value>
  </nameValuePair>
  <!-- The path must be a single unbroken string; a line break inside the
       file name would point at a file that does not exist. -->
  <nameValuePair>
    <name>TokenizerDescriptorPath</name>
    <value><string>D:\SupportingProjects\ConceptMapper\desc\analysis_engine\primitive\OffsetTokenizer.xml</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>ResultingEnclosingSpanName</name>
    <value><string>enclosingSpan</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>MatchedTokensFeatureName</name>
    <value><string>matchedTokens</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>ResultingAnnotationMatchedTextFeature</name>
    <value><string>matchedText</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>SearchStrategy</name>
    <value><string>ContiguousMatch</string></value>
  </nameValuePair>
  <nameValuePair>
    <name>LanguageID</name>
    <value><string>en</string></value>
  </nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<!-- Type system contents: two type systems are imported by name (DictTerm and
     the tokenizer's TokenAnnotation), and uima.tt.TokenAnnotation is declared
     locally below. -->
<imports>
<import name="org.apache.uima.conceptMapper.DictTerm" />
<import
name="org.apache.uima.conceptMapper.support.tokenizer.TokenAnnotation"
/> </imports> <types>
<!-- Locally declared token type, a subtype of uima.tcas.Annotation, with two
     String features (SemClass, POS) and one Integer feature (frost_TokenType). -->
<typeDescription>
<name>uima.tt.TokenAnnotation</name>
<description></description>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>SemClass</name>
<description>
semantic class of token
</description>
<rangeTypeName>
uima.cas.String
</rangeTypeName>
</featureDescription>
<featureDescription>
<name>POS</name>
<description>
Part of SPeech of term to which this
token is a part
</description>
<rangeTypeName>
uima.cas.String
</rangeTypeName>
</featureDescription>
<featureDescription>
<name>frost_TokenType</name>
<description></description>
<rangeTypeName>
uima.cas.Integer
</rangeTypeName>
</featureDescription>
</features>
</typeDescription> </types>
</typeSystemDescription> <typePriorities>
<!-- Only uima.tt.TokenAnnotation is listed; the sentence type entry is commented out. -->
<priorityList>
<!-- <type>uima.tt.SentenceAnnotation</type> -->
<type>uima.tt.TokenAnnotation</type>
</priorityList> </typePriorities> <fsIndexCollection />
<!-- Declared capability: the engine takes uima.tt.TokenAnnotation as input and
     lists DictTerm, both TokenAnnotation types and DocumentAnnotation as
     outputs. No language restriction is declared. -->
<capabilities> <capability>
<inputs>
<type allAnnotatorFeatures="true">
uima.tt.TokenAnnotation
</type>
<!-- <type
allAnnotatorFeatures="true">uima.tt.SentenceAnnotation</type>
<type
allAnnotatorFeatures="true">uima.tt.ParagraphAnnotation</type> -->
</inputs>
<outputs>
<type allAnnotatorFeatures="true">
org.apache.uima.conceptMapper.DictTerm
</type>
<type allAnnotatorFeatures="true">
uima.tt.TokenAnnotation
</type>
<type allAnnotatorFeatures="true">
org.apache.uima.conceptMapper.support.tokenizer.TokenAnnotation
</type>
<type allAnnotatorFeatures="true">
uima.tcas.DocumentAnnotation
</type>
</outputs>
<languagesSupported /> </capability>
</capabilities> <operationalProperties>
<!-- Flags as declared: the engine modifies the CAS it is given, multiple
     deployment is allowed, and it does not output new CASes. -->
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes> </operationalProperties>
</analysisEngineMetaData>
<!-- The annotator requires one external resource, looked up under the key
     DictionaryFile. -->
<externalResourceDependencies>
  <externalResourceDependency>
    <key>DictionaryFile</key>
    <description>dictionary file loader.</description>
    <!-- Interface name must be one unbroken, fully qualified identifier. -->
    <interfaceName>org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource</interfaceName>
    <optional>false</optional>
  </externalResourceDependency>
</externalResourceDependencies>
<!-- Defines the dictionary resource and binds it to the DictionaryFile key. -->
<resourceManagerConfiguration>
  <externalResources>
    <externalResource>
      <name>DictionaryFileName</name>
      <description>
        A file containing the dictionary. Modify this URL to
        use a different dictionary.
      </description>
      <fileResourceSpecifier>
        <!-- Placeholder only: replace the escaped PATH-TO-DICTIONARY marker with
             the real location of testDict.xml. The angle brackets are escaped
             so that the descriptor remains well-formed XML. -->
        <fileUrl>&lt;PATH-TO-DICTIONARY&gt;/testDict.xml</fileUrl>
      </fileResourceSpecifier>
      <!-- Implementation class name must be one unbroken identifier. -->
      <implementationName>org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource_impl</implementationName>
    </externalResource>
  </externalResources>
  <externalResourceBindings>
    <externalResourceBinding>
      <key>DictionaryFile</key>
      <resourceName>DictionaryFileName</resourceName>
    </externalResourceBinding>
  </externalResourceBindings>
</resourceManagerConfiguration>
</taeDescription>
My dictionary file is:
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!-- Dictionary root. Each token element is one entry identified by its
     canonical attribute; each variant child gives one surface form in its
     base attribute. -->
<synonym>
  <token canonical="United States" DOCNO="276145">
    <variant base="United States"/>
    <variant base="United States of America"/>
    <variant base="the United States of America"/>
  </token>
  <token canonical="New York City" DOCNO="10000">
    <variant base="New York City"/>
    <variant base="NYC"/>
    <variant base="Big Apple"/>
  </token>
  <token canonical="Britain" DOCNO="276148">
    <variant base="Britain"/>
  </token>
  <token canonical="Britain" DOCNO="276148,276148a">
    <variant base="Brit'ain"/>
  </token>
  <token canonical=".AT" DOCNO="sdfasd">
    <variant base=".AT"/>
  </token>
  <token canonical="1776" DOCNO="276148">
    <variant base="1776"/>
  </token>
  <token canonical="Europe" DOCNO="276148">
    <variant base="Europe"/>
  </token>
  <token canonical="American" DOCNO="276148">
    <variant base="American"/>
    <variant base="Americans"/>
  </token>
  <token canonical="House of Lords" DOCNO="276148">
    <variant base="House of Lords"/>
  </token>
  <token canonical="House of Commons" DOCNO="276148">
    <variant base="House of Commons"/>
  </token>
  <token canonical="This Is" DOCNO="276148">
    <variant base="This Is"/>
  </token>
  <!-- Entries below carry additional attributes (SemClass, AttributeType,
       AttributeValue, POS, key) and their variants carry POS, key and parent. -->
  <token canonical="bladder neck"
         SemClass="Site"
         AttributeType="ICDO"
         AttributeValue="C67.5"
         POS="NN"
         key="12461">
    <variant base="bladder neck" POS="NN" key="12461" parent="12461"/>
    <variant base="internal urethral orifice" POS="NN" key="12462" parent="12461"/>
  </token>
  <token canonical="bladder, nos"
         SemClass="Site"
         AttributeType="ICDO"
         AttributeValue="C67.9"
         POS="NN"
         key="12469">
    <variant base="bladder, nos" POS="NN" key="12469" parent="12469"/>
    <variant base="bladder wall, nos" POS="NN" key="12470" parent="12469"/>
    <variant base="urinary bladder, nos" POS="NN" key="12471" parent="12469"/>
    <variant base="bladder" POS="NN" key="12472" parent="12469"/>
    <variant base="bladder wall" POS="NN" key="12473" parent="12469"/>
    <variant base="urinary bladder" POS="NN" key="12474" parent="12469"/>
  </token>
</synonym>