You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@uima.apache.org by Imran <im...@gmail.com> on 2014/12/24 11:47:54 UTC

Unable to concept map a text with apache UIMA ConceptMapper

I am trying to write a small example using the Apache UIMA ConceptMapper, 
but I am unable to map the dictionary entries onto the text I pass in. 
Please help me with this. My current source code and settings are:




// Load the ConceptMapper analysis engine from its XML descriptor.
// "<PATH-TO-File>" is a placeholder for the directory that holds the descriptor.
AnalysisEngineDescription conceptMapperDesc = UIMAFramework.getXMLParser()
        .parseAnalysisEngineDescription(
                new XMLInputSource("<PATH-TO-File>/ConceptMapperOffsetTokenizer.xml"));
AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(conceptMapperDesc);

// Create a CAS for this engine and set the text to be annotated.
JCas jcas = ae.newJCas();
jcas.setDocumentText("The Big Apple is a nickname for New York City");

// NOTE(review): the descriptor quoted in this message lists
// uima.tt.TokenAnnotation as a capability *input*, and nothing in this snippet
// adds token annotations to the CAS before process() is called. Presumably a
// tokenizer has to run on the document first (e.g. in an aggregate engine);
// confirm against the ConceptMapper documentation.
ae.process(jcas);

// Print every DictTerm annotation found in the document.
FSIterator<?> iter =
        jcas.getAnnotationIndex(org.apache.uima.conceptMapper.DictTerm.type).iterator();
while (iter.hasNext()) {
    Annotation term = (Annotation) iter.next();
    System.out.println(term);
}






ConceptMapperOffsetTokenizer.xml is:







<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing,
  software distributed under the License is distributed on an
  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  KIND, either express or implied.  See the License for the
  specific language governing permissions and limitations
  under the License.
-->
<!--
  Primitive analysis engine descriptor for the ConceptMapper annotator.
  Sections, in order: parameter declarations, parameter settings, type system,
  type priorities, capabilities, and the dictionary external resource.
-->
<taeDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <annotatorImplementationName>org.apache.uima.conceptMapper.ConceptMapper</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>ConceptMapper</name>
    <description></description>
    <version>1</version>
    <vendor></vendor>

    <!-- Declarations of every parameter the annotator accepts. -->
    <configurationParameters>
      <configurationParameter>
        <name>caseMatch</name>
        <description>
          this parameter specifies the case folding mode:
          ignoreall - fold everything to lowercase for
          matching insensitive - fold only tokens with initial
          caps to lowercase digitfold - fold all (and only)
          tokens with a digit sensitive - perform no case
          folding
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>Stemmer</name>
        <description>
          Name of stemmer class to use before matching. MUST
          have a zero-parameter constructor! If not specified,
          no stemming will be performed.
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>ResultingAnnotationName</name>
        <description>
          Name of the annotation type created by this TAE,
          must match the typeSystemDescription entry
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>ResultingEnclosingSpanName</name>
        <description>
          Name of the feature in the resultingAnnotation to
          contain the span that encloses it (i.e. its
          sentence)
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>AttributeList</name>
        <description>
          List of attribute names for XML dictionary entry
          record - must correspond to FeatureList
        </description>
        <type>String</type>
        <multiValued>true</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>FeatureList</name>
        <description>
          List of feature names for CAS annotation - must
          correspond to AttributeList
        </description>
        <type>String</type>
        <multiValued>true</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>TokenAnnotation</name>
        <description></description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>TokenClassFeatureName</name>
        <description>
          Name of feature used when doing lookups against
          IncludedTokenClasses and ExcludedTokenClasses
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>TokenTextFeatureName</name>
        <description></description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>SpanFeatureStructure</name>
        <description>
          Type of annotation which corresponds to spans of
          data for processing (e.g. a Sentence)
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>OrderIndependentLookup</name>
        <description>
          True if should ignore element order during lookup
          (i.e., "top box" would equal "box top"). Default is
          False.
        </description>
        <type>Boolean</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>TokenTypeFeatureName</name>
        <description>
          Name of feature used when doing lookups against
          IncludedTokenTypes and ExcludedTokenTypes
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>IncludedTokenTypes</name>
        <description>
          Type of tokens to include in lookups (if not
          supplied, then all types are included except those
          specifically mentioned in ExcludedTokenTypes)
        </description>
        <type>Integer</type>
        <multiValued>true</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>ExcludedTokenTypes</name>
        <description></description>
        <type>Integer</type>
        <multiValued>true</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>ExcludedTokenClasses</name>
        <description>
          Class of tokens to exclude from lookups (if not
          supplied, then all classes are excluded except those
          specifically mentioned in IncludedTokenClasses,
          unless IncludedTokenClasses is not supplied, in
          which case none are excluded)
        </description>
        <type>String</type>
        <multiValued>true</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>IncludedTokenClasses</name>
        <description>
          Class of tokens to include in lookups (if not
          supplied, then all classes are included except those
          specifically mentioned in ExcludedTokenClasses)
        </description>
        <type>String</type>
        <multiValued>true</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>TokenClassWriteBackFeatureNames</name>
        <description>
          names of features that should be written back to a
          token, such as a POS tag
        </description>
        <type>String</type>
        <multiValued>true</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>ResultingAnnotationMatchedTextFeature</name>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>PrintDictionary</name>
        <type>Boolean</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>SearchStrategy</name>
        <description>
          Can be either "SkipAnyMatch",
          "SkipAnyMatchAllowOverlap" or
          "ContiguousMatch"&#13;&#13;ContiguousMatch: longest
          match of contiguous tokens within enclosing
          span(taking into account included/excluded items).
          DEFAULT strategy &#13;SkipAnyMatch: longest match of
          not-necessarily contiguous tokens within enclosing
          span (taking into account included/excluded items).
          Subsequent lookups begin in span after complete
          match. IMPLIES order-independent lookup
          &#13;SkipAnyMatchAllowOverlap: longest match of
          not-necessarily contiguous tokens within enclosing
          span (taking into account included/excluded items).
          Subsequent lookups begin in span after next token.
          IMPLIES order-independent lookup
        </description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>StopWords</name>
        <type>String</type>
        <multiValued>true</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>FindAllMatches</name>
        <type>Boolean</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>MatchedTokensFeatureName</name>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>ReplaceCommaWithAND</name>
        <type>Boolean</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>TokenizerDescriptorPath</name>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>LanguageID</name>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>false</mandatory>
      </configurationParameter>
    </configurationParameters>

    <!--
      Values for the parameters declared above. String values are written
      without surrounding whitespace so they are exact type names and paths.
    -->
    <configurationParameterSettings>
      <nameValuePair>
        <name>caseMatch</name>
        <value>
          <string>ignoreall</string>
        </value>
      </nameValuePair>
      <!-- AttributeList and FeatureList are parallel lists (see their descriptions). -->
      <nameValuePair>
        <name>AttributeList</name>
        <value>
          <array>
            <string>canonical</string>
          </array>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>FeatureList</name>
        <value>
          <array>
            <string>DictCanon</string>
          </array>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>TokenAnnotation</name>
        <value>
          <string>uima.tt.TokenAnnotation</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>ResultingAnnotationName</name>
        <value>
          <string>org.apache.uima.conceptMapper.DictTerm</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>SpanFeatureStructure</name>
        <value>
          <string>uima.tcas.DocumentAnnotation</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>OrderIndependentLookup</name>
        <value>
          <boolean>false</boolean>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>TokenClassWriteBackFeatureNames</name>
        <value>
          <array/>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>IncludedTokenClasses</name>
        <value>
          <array/>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>PrintDictionary</name>
        <value>
          <boolean>false</boolean>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>FindAllMatches</name>
        <value>
          <boolean>false</boolean>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>StopWords</name>
        <value>
          <array/>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>ReplaceCommaWithAND</name>
        <value>
          <boolean>false</boolean>
        </value>
      </nameValuePair>
      <!-- The path must stay on one line: a line break inside it becomes part of the value. -->
      <nameValuePair>
        <name>TokenizerDescriptorPath</name>
        <value>
          <string>D:\SupportingProjects\ConceptMapper\desc\analysis_engine\primitive\OffsetTokenizer.xml</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>ResultingEnclosingSpanName</name>
        <value>
          <string>enclosingSpan</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>MatchedTokensFeatureName</name>
        <value>
          <string>matchedTokens</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>ResultingAnnotationMatchedTextFeature</name>
        <value>
          <string>matchedText</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>SearchStrategy</name>
        <value>
          <string>ContiguousMatch</string>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>LanguageID</name>
        <value>
          <string>en</string>
        </value>
      </nameValuePair>
    </configurationParameterSettings>

    <!-- Imported type systems plus the locally declared uima.tt.TokenAnnotation type. -->
    <typeSystemDescription>
      <imports>
        <import name="org.apache.uima.conceptMapper.DictTerm"/>
        <import name="org.apache.uima.conceptMapper.support.tokenizer.TokenAnnotation"/>
      </imports>
      <types>
        <typeDescription>
          <name>uima.tt.TokenAnnotation</name>
          <description></description>
          <supertypeName>uima.tcas.Annotation</supertypeName>
          <features>
            <featureDescription>
              <name>SemClass</name>
              <description>semantic class of token</description>
              <rangeTypeName>uima.cas.String</rangeTypeName>
            </featureDescription>
            <featureDescription>
              <name>POS</name>
              <description>
                Part of SPeech of term to which this
                token is a part
              </description>
              <rangeTypeName>uima.cas.String</rangeTypeName>
            </featureDescription>
            <featureDescription>
              <name>frost_TokenType</name>
              <description></description>
              <rangeTypeName>uima.cas.Integer</rangeTypeName>
            </featureDescription>
          </features>
        </typeDescription>
      </types>
    </typeSystemDescription>

    <typePriorities>
      <priorityList>
        <!-- <type>uima.tt.SentenceAnnotation</type> -->
        <type>uima.tt.TokenAnnotation</type>
      </priorityList>
    </typePriorities>
    <fsIndexCollection/>

    <!-- Token annotations are declared as an input of this engine. -->
    <capabilities>
      <capability>
        <inputs>
          <type allAnnotatorFeatures="true">uima.tt.TokenAnnotation</type>
          <!-- <type allAnnotatorFeatures="true">uima.tt.SentenceAnnotation</type>
               <type allAnnotatorFeatures="true">uima.tt.ParagraphAnnotation</type> -->
        </inputs>
        <outputs>
          <type allAnnotatorFeatures="true">org.apache.uima.conceptMapper.DictTerm</type>
          <type allAnnotatorFeatures="true">uima.tt.TokenAnnotation</type>
          <type allAnnotatorFeatures="true">org.apache.uima.conceptMapper.support.tokenizer.TokenAnnotation</type>
          <type allAnnotatorFeatures="true">uima.tcas.DocumentAnnotation</type>
        </outputs>
        <languagesSupported/>
      </capability>
    </capabilities>

    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>

  <!-- The annotator depends on one external resource, keyed "DictionaryFile". -->
  <externalResourceDependencies>
    <externalResourceDependency>
      <key>DictionaryFile</key>
      <description>dictionary file loader.</description>
      <interfaceName>org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource</interfaceName>
      <optional>false</optional>
    </externalResourceDependency>
  </externalResourceDependencies>

  <resourceManagerConfiguration>
    <externalResources>
      <externalResource>
        <name>DictionaryFileName</name>
        <description>
          A file containing the dictionary. Modify this URL to
          use a different dictionary.
        </description>
        <fileResourceSpecifier>
          <!--
            Placeholder: replace with the real location of testDict.xml.
            The angle brackets are escaped so the descriptor stays well-formed.
          -->
          <fileUrl>&lt;PATH-TO-DICTIONARY&gt;/testDict.xml</fileUrl>
        </fileResourceSpecifier>
        <implementationName>org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource_impl</implementationName>
      </externalResource>
    </externalResources>
    <!-- Binds the "DictionaryFile" dependency key to the resource defined above. -->
    <externalResourceBindings>
      <externalResourceBinding>
        <key>DictionaryFile</key>
        <resourceName>DictionaryFileName</resourceName>
      </externalResourceBinding>
    </externalResourceBindings>
  </resourceManagerConfiguration>
</taeDescription>












My dictionary file is:










<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing,
  software distributed under the License is distributed on an
  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  KIND, either express or implied.  See the License for the
  specific language governing permissions and limitations
  under the License.    
-->
<!--
  ConceptMapper dictionary: each <token> is one entry and each <variant>
  gives a surface form (base) for that entry.
-->
<synonym>
  <token canonical="United States" DOCNO="276145">
    <variant base="United States"/>
    <variant base="United States of America"/>
    <variant base="the United States of America"/>
  </token>
  <token canonical="New York City" DOCNO="10000">
    <variant base="New York City"/>
    <variant base="NYC"/>
    <variant base="Big Apple"/>
  </token>
  <token canonical="Britain" DOCNO="276148">
    <variant base="Britain"/>
  </token>
  <token canonical="Britain" DOCNO="276148,276148a">
    <variant base="Brit'ain"/>
  </token>
  <token canonical=".AT" DOCNO="sdfasd">
    <variant base=".AT"/>
  </token>
  <token canonical="1776" DOCNO="276148">
    <variant base="1776"/>
  </token>
  <token canonical="Europe" DOCNO="276148">
    <variant base="Europe"/>
  </token>
  <token canonical="American" DOCNO="276148">
    <variant base="American"/>
    <variant base="Americans"/>
  </token>
  <token canonical="House of Lords" DOCNO="276148">
    <variant base="House of Lords"/>
  </token>
  <token canonical="House of Commons" DOCNO="276148">
    <variant base="House of Commons"/>
  </token>
  <token canonical="This Is" DOCNO="276148">
    <variant base="This Is"/>
  </token>
  <!-- Entries carrying extra attributes (SemClass, AttributeType, AttributeValue, POS, key). -->
  <token canonical="bladder neck"
         SemClass="Site"
         AttributeType="ICDO"
         AttributeValue="C67.5"
         POS="NN"
         key="12461">
    <variant base="bladder neck" POS="NN" key="12461" parent="12461"/>
    <variant base="internal urethral orifice" POS="NN" key="12462" parent="12461"/>
  </token>
  <token canonical="bladder, nos"
         SemClass="Site"
         AttributeType="ICDO"
         AttributeValue="C67.9"
         POS="NN"
         key="12469">
    <variant base="bladder, nos" POS="NN" key="12469" parent="12469"/>
    <variant base="bladder wall, nos" POS="NN" key="12470" parent="12469"/>
    <variant base="urinary bladder, nos" POS="NN" key="12471" parent="12469"/>
    <variant base="bladder" POS="NN" key="12472" parent="12469"/>
    <variant base="bladder wall" POS="NN" key="12473" parent="12469"/>
    <variant base="urinary bladder" POS="NN" key="12474" parent="12469"/>
  </token>
</synonym>