Posted to commits@uima.apache.org by mb...@apache.org on 2008/06/23 14:31:42 UTC

svn commit: r670541 [2/4] - in /incubator/uima/sandbox/trunk/ConceptMapper: ./ desc/ desc/analysis_engine/ desc/analysis_engine/aggregate/ desc/analysis_engine/primitive/ desc/collection_processing_engines/ doc/ resources/ resources/dict/ src/ src/main...

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/ConceptMapper.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/ConceptMapper.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/ConceptMapper.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/ConceptMapper.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,1025 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.conceptMapper;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Properties;
+import java.util.TreeMap;
+
+import org.apache.uima.analysis_engine.ResultSpecification;
+import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
+import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
+import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
+import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
+import org.apache.uima.analysis_engine.annotator.Annotator_ImplBase;
+import org.apache.uima.analysis_engine.annotator.TextAnnotator;
+import org.apache.uima.cas.FSIndex;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource;
+import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry;
+import org.apache.uima.conceptMapper.support.tokens.TokenFilter;
+import org.apache.uima.conceptMapper.support.tokens.TokenNormalizer;
+import org.apache.uima.conceptMapper.support.tokens.UnknownTypeException;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
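+/**
+ * UIMA annotator that looks up words and phrases from a dictionary resource within each enclosing
+ * span (e.g. sentence) of a document and posts a result annotation for every match found. Tokens
+ * are filtered and normalized before lookup, and matching is either contiguous or may skip
+ * intervening tokens, depending on the configured search strategy.
+ */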
+public class ConceptMapper extends Annotator_ImplBase implements TextAnnotator {
+
+  /** Configuration parameter key/label for the dictionary file to load */
+  public static final String PARAM_DICT_FILE = "DictionaryFile";
+
+  /**
+   * Configuration parameter naming the token class feature of token annotations, used to
+   * distinguish classes of tokens to skip during lookups. Token class features are Strings.
+   */
+  public static final String PARAM_TOKENCLASSFEATURENAME = "TokenClassFeatureName";
+
+  private String tokenClassFeatureName;
+
+  /**
+   * Configuration parameter naming the token type feature of token annotations, used to
+   * distinguish types of tokens to skip during lookups. Token type features are Integers.
+   */
+  public static final String PARAM_TOKENTYPEFEATURENAME = "TokenTypeFeatureName";
+
+  private String tokenTypeFeatureName;
+
+  /** Configuration parameter key/label for the annotation name */
+  public static final String PARAM_ANNOTATION_NAME = "ResultingAnnotationName";
+
+  /**
+   * Configuration parameter key/label for the name of the feature that holds the resulting
+   * term's enclosing span, e.g. its sentence
+   */
+  public static final String PARAM_ENCLOSINGSPAN = "ResultingEnclosingSpanName";
+
+  private String resultEnclosingSpanName;
+
+  private Feature resultEnclosingSpan;
+
+  /**
+   * Configuration parameter naming the feature of the resulting annotation in which to store the
+   * text matched by a successful dictionary lookup
+   */
+  public static final String PARAM_MATCHEDFEATURE = "ResultingAnnotationMatchedTextFeature";
+
+  private String resultMatchedTextFeatureName;
+
+  private Feature resultMatchedTextFeature;
+
+  /** Configuration parameter key/label for the attribute list */
+  public static final String PARAM_ATTRIBUTE_LIST = "AttributeList";
+
+  /** Configuration parameter key/label for the feature list */
+  public static final String PARAM_FEATURE_LIST = "FeatureList";
+
+  /** Configuration parameter giving type of tokens */
+  public static final String PARAM_TOKENANNOTATION = "TokenAnnotation";
+
+  private String tokenAnnotationName;
+
+  /**
+   * Configuration parameter specifying the name of the token feature that contains its text. If
+   * not specified, the token annotation's covered text is used.
+   */
+  public static final String PARAM_TOKENTEXTFEATURENAME = "TokenTextFeatureName";
+
+  private String tokenTextFeatureName;
+
+  private Feature tokenTextFeature;
+
+  /**
+   * Configuration parameter listing features of the token annotation that should be written back
+   * to the token from the matching dictionary entry. For example, if a part of speech is
+   * specified as part of a dictionary entry, it could be written back to the token so that a
+   * subsequent POS tagger could use it as a pre-annotated item.
+   */
+  public static final String PARAM_TOKENCLASSWRITEBACKFEATURENAMES = "TokenClassWriteBackFeatureNames";
+
+  private String[] tokenClassWriteBackFeatureNames;
+
+  private Feature[] tokenClassWriteBackFeatures;
+
+  /**
+   * Configuration parameter naming the feature of the result annotation that holds the list of
+   * matched tokens
+   */
+  public static final String PARAM_MATCHEDTOKENSFEATURENAME = "MatchedTokensFeatureName";
+
+  private String matchedTokensFeatureName;
+
+  private Feature matchedTokensFeature;
+
+  /**
+   * Configuration parameter key/label to indicate if order-independent lookup is to be performed.
+   * If true, words in a phrase are sorted alphabetically before lookup. This implies that a phrase
+   * "C D A" would be considered equivalent to "A C D" and "D A C", etc.
+   */
+  public static final String PARAM_ORDERINDEPENDENTLOOKUP = "OrderIndependentLookup";
+
+  private boolean sortElements;
+
+  private final static int ContiguousMatch = 1;
+
+  public static final String PARAMVALUE_CONTIGUOUSMATCH = "ContiguousMatch";
+
+  private final static int SkipAnyMatch = 2;
+
+  public static final String PARAMVALUE_SKIPANYMATCH = "SkipAnyMatch";
+
+  private static final int SkipAnyMatchAllowOverlap = 3;
+
+  public static final String PARAMVALUE_SKIPANYMATCHALLOWOVERLAP = "SkipAnyMatchAllowOverlap";
+
+  private final static int DefaultSearchStrategy = ContiguousMatch;
+
+  // private final static int DefaultSearchStrategy = SkipAnyMatch;
+
+  /**
+   * Configuration parameter indicating the search strategy, one of:
+   * ContiguousMatch: longest match of contiguous tokens within the enclosing span (taking into
+   * account included/excluded items). DEFAULT strategy.
+   * SkipAnyMatch: longest match of non-contiguous tokens within the enclosing span (taking into
+   * account included/excluded items). IMPLIES order-independent lookup.
+   * SkipAnyMatchAllowOverlap: like SkipAnyMatch, but matched tokens may also take part in
+   * subsequent, overlapping matches.
+   */
+  public static final String PARAM_SEARCHSTRATEGY = "SearchStrategy";
+
+  private int searchStrategy = DefaultSearchStrategy;
+
+  public static final String PARAM_FINDALLMATCHES = "FindAllMatches";
+
+  private boolean findAllMatches;
+
+  /** object used to stem/case normalize text */
+  private TokenNormalizer tokenNormalizer;
+
+  private TokenFilter tokenFilter;
+
+  /** The name of the annotation type posted to the CAS by this TAE */
+  protected String resultAnnotationName;
+
+  /** The type of annotation posted to the CAS by this TAE */
+  protected Type resultAnnotationType;
+
+  /** The type of token annotations to consider */
+  protected Type tokenType;
+
+  /**
+   * Array of Feature objects associated with {@link #resultAnnotationType resultAnnotationType}
+   */
+  protected Feature features[];
+
+  /** Array of feature names, obtained as a configuration parameter. */
+  protected String featureNames[];
+
+  /**
+   * Array of attribute names for the XML dictionary token element, obtained as a configuration
+   * parameter.
+   */
+  protected String attributeNames[];
+
+  /** The dictionary */
+  private DictionaryResource dict;
+
+  /**
+   * type of annotation that defines a block for processing, e.g. a sentence
+   */
+  private static final String PARAM_DATA_BLOCK_FS = "SpanFeatureStructure";
+
+  private String spanFeatureStructureName;
+
+  private Type spanFeatureStructureType;
+
+  private Logger logger;
+
+  private JCas jcas;
+
+  private static final String PARAM_TOKENIZERDESCRIPTOR = "TokenizerDescriptorPath";
+
+  // private FileWriter tokenDebugFile;
+  // private FileWriter potentialMatchDebugFile;
+  // private FileWriter findMatchDebugFile;
+
+  // private void debugWrite (FileWriter file, String text)
+  // {
+  // try
+  // {
+  // file.write(text + "\n");
+  // }
+  // catch (Exception e)
+  // {
+  // }
+  // }
+
+  /**
+   * Initialize the annotator, which includes compilation of regular expressions, fetching
+   * configuration parameters from XML descriptor file, and loading of the dictionary file.
+   */
+  public void initialize(AnnotatorContext annotatorContext) throws AnnotatorConfigurationException,
+          AnnotatorInitializationException {
+    super.initialize(annotatorContext);
+
+    // Process configuration parameters
+    try {
+      // logger = new Logger (annotatorContext.getLogger ());
+      logger = new Logger("ConceptMapper", annotatorContext.getLogger());
+      // tokenDebugFile = new FileWriter("/tmp/cm/tokens."+
+      // Calendar.getInstance ().getTimeInMillis () + ".txt");
+      // potentialMatchDebugFile = new FileWriter("/tmp/cm/pm."+
+      // Calendar.getInstance ().getTimeInMillis () + ".txt");
+      // findMatchDebugFile = new FileWriter("/tmp/cm/fm."+
+      // Calendar.getInstance ().getTimeInMillis () + ".txt");
+      // FileWriter dictDebugFile = new FileWriter("/tmp/cm/dict."+
+      // Calendar.getInstance ().getTimeInMillis () + ".txt");
+
+      tokenAnnotationName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_TOKENANNOTATION);
+      String tokenizerDescriptor = (String) annotatorContext
+              .getConfigParameterValue(PARAM_TOKENIZERDESCRIPTOR);
+
+      tokenClassFeatureName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_TOKENCLASSFEATURENAME);
+
+      tokenTypeFeatureName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_TOKENTYPEFEATURENAME);
+
+      resultAnnotationName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_ANNOTATION_NAME);
+      resultEnclosingSpanName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_ENCLOSINGSPAN);
+
+      resultMatchedTextFeatureName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_MATCHEDFEATURE);
+
+      featureNames = (String[]) annotatorContext.getConfigParameterValue(PARAM_FEATURE_LIST);
+      attributeNames = (String[]) annotatorContext.getConfigParameterValue(PARAM_ATTRIBUTE_LIST);
+
+      spanFeatureStructureName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_DATA_BLOCK_FS);
+
+      tokenTextFeatureName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_TOKENTEXTFEATURENAME);
+      tokenClassWriteBackFeatureNames = (String[]) annotatorContext
+              .getConfigParameterValue(PARAM_TOKENCLASSWRITEBACKFEATURENAMES);
+
+      tokenAnnotationName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_TOKENANNOTATION);
+
+      matchedTokensFeatureName = (String) annotatorContext
+              .getConfigParameterValue(PARAM_MATCHEDTOKENSFEATURENAME);
+
+      Boolean sortElementsParam = (Boolean) annotatorContext
+              .getConfigParameterValue(PARAM_ORDERINDEPENDENTLOOKUP);
+      sortElements = (sortElementsParam == null) ? false : sortElementsParam.booleanValue();
+
+      searchStrategy = detectSearchStrategy((String) annotatorContext
+              .getConfigParameterValue(PARAM_SEARCHSTRATEGY));
+      // System.err.println("SEARCH STRATEGY = " + searchStrategy);
+
+      Boolean findAllMatchesParam = (Boolean) annotatorContext
+              .getConfigParameterValue(PARAM_FINDALLMATCHES);
+      findAllMatches = (findAllMatchesParam == null) ? false : findAllMatchesParam.booleanValue();
+
+      // always do order-independent lookup if performing "SkipAnyMatch"
+      // lookups
+      if (searchStrategy == SkipAnyMatch) {
+        sortElements = true;
+      }
+
+      if (featureNames.length != attributeNames.length) {
+        throw new Exception("AttributeList and FeatureList are inconsistent");
+      }
+      // for (int i = 0; i < featureNames.length; i++ )
+      // {
+      // logger.logInfo ("Attribute \"" + attributeNames [i] + "\" mapped
+      // to feature \"" + featureNames [i] + "\"");
+      // }
+
+      tokenNormalizer = new TokenNormalizer(annotatorContext, logger);
+      tokenFilter = new TokenFilter(tokenAnnotationName, tokenTypeFeatureName,
+              tokenClassFeatureName, logger);
+      tokenFilter.initConfig(annotatorContext);
+
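+      // the dictionary is a shared resource: load its contents only if no other annotator
+      // instance has loaded them already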
+      dict = (DictionaryResource) annotatorContext.getResourceObject(PARAM_DICT_FILE);
+      if (!dict.isLoaded()) {
+        // logger.logInfo("dictionary not yet loaded");
+        dict.loadDictionaryContents(annotatorContext, logger, tokenAnnotationName,
+                tokenTypeFeatureName, tokenClassFeatureName, tokenizerDescriptor);
+        // logger.logInfo( "now is loaded: "+dict.toString() );
+        // System.err.println ("NEW DICTIONARY:\n" + dict.toString());
+        // debugWrite (dictDebugFile, dict.toString());
+      }
+
+    } catch (Exception e) {
+      throw new AnnotatorConfigurationException(e);
+    }
+  }
+
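+  /**
+   * Map the SearchStrategy configuration string onto one of the internal strategy constants. A
+   * missing or empty value selects the default strategy (ContiguousMatch); any other unrecognized
+   * value is a configuration error.
+   */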
+  private int detectSearchStrategy(String strategyString) throws AnnotatorConfigurationException {
+    if ((strategyString == null) || (strategyString.equals(""))) {
+      return DefaultSearchStrategy;
+    } else if (strategyString.equals(PARAMVALUE_CONTIGUOUSMATCH)) {
+      return ContiguousMatch;
+    } else if (strategyString.equals(PARAMVALUE_SKIPANYMATCH)) {
+      return SkipAnyMatch;
+    } else if (strategyString.equals(PARAMVALUE_SKIPANYMATCHALLOWOVERLAP)) {
+      return SkipAnyMatchAllowOverlap;
+    } else {
+      throw new AnnotatorConfigurationException();
+    }
+  }
+
+  /**
+   * Perform local type system initialization.
+   * 
+   * @param typeSystem
+   *          the current type system.
+   * @see org.apache.uima.analysis_engine.annotator.TextAnnotator#typeSystemInit(TypeSystem)
+   */
+  public void typeSystemInit(TypeSystem typeSystem) throws AnnotatorConfigurationException,
+          AnnotatorInitializationException {
+
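+    // resolve all configured type and feature names against the CAS type system, failing
+    // initialization immediately if any of them cannot be found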
+    tokenType = typeSystem.getType(tokenAnnotationName);
+    if (tokenType == null) {
+      logger.logError(PARAM_TOKENANNOTATION + " '" + tokenAnnotationName
+              + "' specified, but does not exist");
+      throw new AnnotatorInitializationException();
+    }
+
+    if ((tokenTextFeatureName == null) || (tokenTextFeatureName.equals(""))) {
+      tokenTextFeature = null;
+    } else {
+      tokenTextFeature = tokenType.getFeatureByBaseName(tokenTextFeatureName);
+      if (tokenTextFeature == null) {
+        logger.logError(PARAM_TOKENTEXTFEATURENAME + " '" + tokenTextFeatureName
+                + "' specified, but does not exist for type: " + tokenType.getName());
+        throw new AnnotatorInitializationException();
+      }
+    }
+
+    if ((tokenClassWriteBackFeatureNames != null) && (tokenClassWriteBackFeatureNames.length > 0)) {
+      tokenClassWriteBackFeatures = new Feature[tokenClassWriteBackFeatureNames.length];
+      for (int i = 0; i < tokenClassWriteBackFeatureNames.length; i++) {
+        tokenClassWriteBackFeatures[i] = tokenType
+                .getFeatureByBaseName(tokenClassWriteBackFeatureNames[i]);
+        if (tokenClassWriteBackFeatures[i] == null) {
+          logger.logError(PARAM_TOKENCLASSWRITEBACKFEATURENAMES + "[" + i + "] '"
+                  + tokenClassWriteBackFeatureNames[i]
+                  + "' specified, but does not exist for type: " + tokenType.getName());
+          throw new AnnotatorInitializationException();
+        }
+      }
+    } else {
+      tokenClassWriteBackFeatures = null;
+    }
+
+    spanFeatureStructureType = typeSystem.getType(spanFeatureStructureName);
+    if (spanFeatureStructureType == null) {
+      logger.logError(PARAM_DATA_BLOCK_FS + " '" + spanFeatureStructureName
+              + "' specified, but does not exist");
+      throw new AnnotatorInitializationException();
+    }
+
+    resultAnnotationType = typeSystem.getType(resultAnnotationName);
+    if (resultAnnotationType == null) {
+      logger.logError(PARAM_ANNOTATION_NAME + " '" + resultAnnotationName
+              + "' specified, but does not exist");
+      throw new AnnotatorInitializationException();
+    }
+
+    if ((resultEnclosingSpanName == null) || (resultEnclosingSpanName.equals(""))) {
+      resultEnclosingSpan = null;
+    } else {
+      resultEnclosingSpan = resultAnnotationType.getFeatureByBaseName(resultEnclosingSpanName);
+      if (resultEnclosingSpan == null) {
+        logger.logError(PARAM_ENCLOSINGSPAN + " '" + resultEnclosingSpanName
+                + "' specified, but does not exist for type: " + resultAnnotationType.getName());
+        throw new AnnotatorInitializationException();
+      }
+    }
+
+    if ((resultMatchedTextFeatureName == null) || (resultMatchedTextFeatureName.equals(""))) {
+      resultMatchedTextFeature = null;
+    } else {
+      resultMatchedTextFeature = resultAnnotationType
+              .getFeatureByBaseName(resultMatchedTextFeatureName);
+      if (resultMatchedTextFeature == null) {
+        logger.logError(PARAM_MATCHEDFEATURE + " '" + resultMatchedTextFeatureName
+                + "' specified, but does not exist for type: " + resultAnnotationType.getName());
+        throw new AnnotatorInitializationException();
+      }
+    }
+
+    if ((matchedTokensFeatureName == null) || (matchedTokensFeatureName.equals(""))) {
+      matchedTokensFeature = null;
+    } else {
+      matchedTokensFeature = resultAnnotationType.getFeatureByBaseName(matchedTokensFeatureName);
+      if (matchedTokensFeature == null) {
+        logger.logError(PARAM_MATCHEDTOKENSFEATURENAME + " '" + matchedTokensFeatureName
+                + "' specified, but does not exist for type: " + resultAnnotationType.getName());
+        throw new AnnotatorInitializationException();
+      }
+    }
+
+    int numFeatures = featureNames.length;
+    features = new Feature[numFeatures];
+
+    for (int i = 0; i < numFeatures; i++) {
+      features[i] = resultAnnotationType.getFeatureByBaseName(featureNames[i]);
+      if (features[i] == null) {
+        logger.logError(PARAM_FEATURE_LIST + "[" + i + "] '" + featureNames[i]
+                + "' specified, but does not exist for type: " + resultAnnotationType.getName());
+        // System.err.println (PARAM_FEATURE_LIST + "[" + i + "] '" +
+        // featureNames[i] + "' specified, but does not exist for type:
+        // " + resultAnnotationType.getName());
+        throw new AnnotatorInitializationException();
+      }
+
+    }
+
+    try {
+      tokenFilter.initTypes(typeSystem);
+    } catch (UnknownTypeException e) {
+      throw new AnnotatorInitializationException(e);
+    }
+  }
+
+  /**
+   * Perform the actual analysis. Iterate over the document content looking for any matching words
+   * or phrases in the loaded dictionary and post an annotation for each match found.
+   * 
+   * @param tcas
+   *          the current CAS to process.
+   * @param aResultSpec
+   *          a specification of the result annotation that should be created by this annotator
+   * 
+   * @see org.apache.uima.analysis_engine.annotator.TextAnnotator#process(CAS,ResultSpecification)
+   */
+  public void process(CAS tcas, ResultSpecification aResultSpec) throws AnnotatorProcessException {
+    // System.err.println ("ConceptMapper.process() begin");
+
+    AnnotationFS token;
+
+    try {
+      setJCas(tcas.getJCas()); // this is needed to get around an issue
+      // where UIMA crashes if no JCas is
+      // referenced
+      // logger.setupDocument (getJCas ());
+
+      FSIndex dbIndex = tcas.getAnnotationIndex(spanFeatureStructureType);
+      FSIterator spanIterator = dbIndex.iterator();
+
+      AnnotationIndex tokenIndex = (AnnotationIndex) tcas.getAnnotationIndex(tokenType);
+
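+      // iterate over each enclosing span (e.g. sentence); for every span, collect the tokens it
+      // covers that pass the token filter, then hand the token list to the configured matching
+      // strategy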
+      while (spanIterator.hasNext()) {
+        ArrayList<AnnotationFS> tokens = new ArrayList<AnnotationFS>(2048);
+
+        Annotation spanAnnotation = (Annotation) spanIterator.next();
+
+        FSIterator tokenIter = tokenIndex.subiterator(spanAnnotation);
+
+        // System.err.println ("Tokens:");
+
+        // get all tokens for the specified block
+        while (tokenIter.hasNext()) {
+          token = (AnnotationFS) tokenIter.next();
+          // System.err.print ("--> token: '" + token.getCoveredText()
+          // + "' ");
+          if (tokenFilter.isOK_Token(token, tokenNormalizer)) {
+            // System.err.println("--> ADDING token: " +
+            // token.getCoveredText());
+            // debugWrite(tokenDebugFile, "--> ADDING token: " +
+            // token.getCoveredText() + ", type: " +
+            // token.getIntValue (tokenTypeFeature) + ", checkType:
+            // " + checkTokenType (token));
+
+            tokens.add(token);
+          }
+          // else
+          // {
+          // System.err.println("-->NOT! ADDING token: " +
+          // token.getCoveredText());
+          // debugWrite(tokenDebugFile, "-->NOT! ADDING token: " +
+          // token.getCoveredText() + ", type: " + token.getIntValue
+          // (tokenTypeFeature) + ", checkType: " + checkTokenType
+          // (token));
+          // }
+        }
+        // System.err.println ();
+        // logger.logInfo("Number of tokens: " + tokens.size());
+
+        switch (searchStrategy) {
+          case SkipAnyMatch:
+          case SkipAnyMatchAllowOverlap:
+            processTokenListSkipAny(searchStrategy, findAllMatches, tcas, tokens, spanAnnotation);
+            break;
+          case ContiguousMatch:
+            processTokenList(searchStrategy, findAllMatches, tcas, tokens, spanAnnotation);
+            break;
+          default:
+            processTokenList(searchStrategy, findAllMatches, tcas, tokens, spanAnnotation);
+            break;
+        }
+
+      }
+      // logger.logFinest("Number of annotations in CAS: " +
+      // (tcas.getAnnotationIndex().size() - 1));
+      // System.out.println("Number of annotations in CAS: " +
+      // (tcas.getAnnotationIndex().size() - 1));
+    } catch (Exception e) {
+      throw new AnnotatorProcessException(e);
+    }
+    // System.err.println ("ConceptMapper.process() end");
+  }
+
+  private void setJCas(JCas jcas) {
+    this.jcas = jcas;
+  }
+
+  private JCas getJCas() {
+    return this.jcas;
+  }
+
+  private void processTokenListSkipAny(int searchStrategy, boolean findAllMatches, CAS tcas,
+          ArrayList<AnnotationFS> tokens, Annotation spanAnnotation) {
+    AnnotationFS token;
+    // iterate over vector of tokens
+
+    ArrayList<String> normalizedTokens = new ArrayList<String>();
+
+    // mapping from words in sentence to list of dictionary entries starting with that word
+    Map<String, Collection<DictEntry>> potentialEntries = new HashMap<String, Collection<DictEntry>>();
+
+    // iterate through all tokens within span and collect dict entries for each unique one
+    for (int whichToken = 0; whichToken < tokens.size(); whichToken++) {
+      token = tokens.get(whichToken);
+      String tokenText = getTokenText(token);
+
+      String word = tokenNormalizer.normalize(tokenText);
+      normalizedTokens.add(word);
+
+      // logger.logInfo("ENTRY SEARCH/ORIGINAL: " + word + " / " +
+      // tokenText);
+      // System.err.println("ENTRY SEARCH/ORIGINAL: " + word + " / " +
+      // tokenText);
+    }
+    potentialEntries = findPotentialEntries(normalizedTokens, dict);
+
+    // System.err.println ("processTokenListSkipAny finding matches for " +
+    // normalizedTokens.toString ());
+
+    findMatchesSkipAnyToken(searchStrategy, findAllMatches, tcas, tokens, normalizedTokens,
+            potentialEntries, spanAnnotation);
+  }
+
+  private Map<String, Collection<DictEntry>> findPotentialEntries(
+          ArrayList<String> normalizedTokens, DictionaryResource dict) {
+    HashMap<String, Collection<DictEntry>> potentialEntries = new HashMap<String, Collection<DictEntry>>();
+
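+    // for each normalized token in the span, look up the dictionary entries indexed under it and
+    // keep those whose complete element list occurs somewhere in the span; these become the
+    // candidate matches tested during skip-any lookup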
+    Iterator<String> tokenIter = normalizedTokens.iterator();
+    while (tokenIter.hasNext()) {
+      String word = tokenIter.next();
+      Collection<DictEntry> entries = potentialEntries.get(word);
+
+      if (entries == null) {
+        entries = new ArrayList<DictEntry>();
+      }
+      DictionaryResource.DictEntriesByLength entriesByLength = dict.getEntries(word);
+      if (entriesByLength != null) {
+        int shortest = entriesByLength.getShortest().intValue();
+        int longest = entriesByLength.getLongest().intValue();
+        for (int currentLength = longest; currentLength >= shortest; currentLength--) {
+          DictionaryResource.DictEntries dictEntries = entriesByLength.getEntries(currentLength);
+          if (dictEntries != null) {
+            ArrayList<DictEntry> entryItems = dictEntries.getEntries();
+            Iterator<DictEntry> entryIter = entryItems.iterator();
+            while (entryIter.hasNext()) {
+              DictionaryResource.DictEntry entry = (DictionaryResource.DictEntry) entryIter.next();
+              // System.err.println("entryIter = " + entryIter +
+              // ", Entry: " + entry.getText ());
+              // debugWrite (potentialMatchDebugFile, "Entry: " +
+              // entry.getText ());
+              if ((normalizedTokens.containsAll(entry.getElements())) && (!entries.contains(entry))) {
+                entries.add(entry);
+                // System.err.println ("Added potential match: "
+                // + entry);
+                // debugWrite (potentialMatchDebugFile, "Added
+                // potential match: " + entry);
+              }
+            }
+          }
+        }
+      }
+
+      potentialEntries.put(word, entries);
+
+    }
+    return potentialEntries;
+  }
+
+  /**
+   * @param searchStrategy
+   *          either SkipAnyMatch or SkipAnyMatchAllowOverlap
+   * @param findAllMatches
+   *          if true, report every matching entry instead of stopping at the first
+   * @param tcas
+   * @param tokens
+   *          list of token annotations
+   * @param normalizedTokens
+   *          list of token annotations as normalized strings
+   * @param potentialEntries
+   *          map from token text to possible matches from the dictionary
+   * @param spanAnnotation
+   *          the enclosing span being processed
+   */
+  private void findMatchesSkipAnyToken(int searchStrategy, boolean findAllMatches, CAS tcas,
+          ArrayList<AnnotationFS> tokens, ArrayList<String> normalizedTokens,
+          Map<String, Collection<DictEntry>> potentialEntries, Annotation spanAnnotation) {
+    int whichToken = 0; // use index instead of iterator to simplify walking
+    // through parallel arrays (tokens/normalizedTokens)
+
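+    // walk the tokens left to right: at each position, test the candidate entries indexed under
+    // the current token; an entry matches when all of its elements occur among the remaining
+    // tokens of the span. After a match (unless all matches are being collected) skip past the
+    // matched tokens, or advance by a single token when overlapping matches are allowed;
+    // otherwise advance by one token.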
+    while (whichToken < normalizedTokens.size()) {
+      // System.err.println ("findMatchesSkipAnyToken(), whichToken = " +
+      // whichToken + ", token: " + (String) normalizedTokens.get
+      // (whichToken));
+      Collection<DictEntry> entries = potentialEntries.get(normalizedTokens.get(whichToken));
+      if (entries == null) {
+        whichToken += 1;
+      } else {
+        Iterator<DictEntry> entryIter = entries.iterator();
+        boolean foundMatch = false;
+        while ((entryIter.hasNext() && (!foundMatch))) {
+          DictionaryResource.DictEntry entry = entryIter.next();
+
+          // System.err.println("entryIter = " + entryIter + ", Entry:
+          // " + entry.getText ());
+          // debugWrite (findMatchDebugFile, "Entry: " + entry.getText
+          // ());
+          // System.err.println("remainingTokens = " +
+          // normalizedTokens.subList (whichToken,
+          // normalizedTokens.size ()).toString ());
+          // debugWrite (findMatchDebugFile, "remainingTokens = " +
+          // normalizedTokens.subList (whichToken,
+          // normalizedTokens.size ()).toString ());
+
+          if (normalizedTokens.subList(whichToken, normalizedTokens.size()).containsAll(
+                  entry.getElements())) {
+            int lengthOfMatch = processMatch(tcas, tokens, normalizedTokens, spanAnnotation,
+                    whichToken, entry);
+            if (!findAllMatches) {
+              foundMatch = true;
+              if (searchStrategy == SkipAnyMatchAllowOverlap) {
+                whichToken += 1;
+              } else {
+                whichToken += lengthOfMatch;
+              }
+              // System.err.println ("Processed match, whichToken
+              // = " + whichToken);
+              // debugWrite (findMatchDebugFile, "Processed match,
+              // whichToken = " + whichToken);
+            }
+          }
+        }
+        if (!foundMatch) {
+          whichToken += 1;
+        }
+      }
+    }
+  }
+
+  /**
+   * @param tcas
+   * @param tokens
+   *          list of token annotations
+   * @param normalizedTokens
+   *          list of token annotations as strings
+   * @param spanAnnotation
+   * @param whichToken
+   *          current token index (for tokens/normalizedTokens)
+   * @param entry
+   *          matching dict entry
+   * @return length of match (in tokens)
+   */
+  private int processMatch(CAS tcas, ArrayList<AnnotationFS> tokens,
+          ArrayList<String> normalizedTokens, Annotation spanAnnotation, int whichToken,
+          DictionaryResource.DictEntry entry) {
+    int startingPoint = whichToken;
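+    // entryOccurences is a multiset (element -> remaining count) of the entry's elements; the
+    // loop below consumes matching tokens left to right, growing the annotation bounds and the
+    // matched-token list, and the annotation is only created once every element has been consumed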
+    TreeMap<String, Integer> entryOccurences = findEntryOccurences(entry.getElements(), whichToken);
+    int begin = -1;
+    int end = 0;
+    StringBuffer matchedText = new StringBuffer();
+
+    // while there are still items to match against
+    ArrayList<AnnotationFS> matched = new ArrayList<AnnotationFS>();
+    while ((!entryOccurences.isEmpty()) && (whichToken < normalizedTokens.size())) {
+      String currentTokenText = normalizedTokens.get(whichToken);
+      // System.err.println ("matchedText: '" + matchedText + "',
+      // whichToken = " + whichToken + ", currentTokenText: " +
+      // currentTokenText);
+
+      // if the dict entry contains at least one more of the current
+      // token, process it
+      Integer count = entryOccurences.get(currentTokenText);
+      if (count != null) {
+        if (matchedText.length() != 0) {
+          matchedText.append(' ');
+        }
+        matchedText.append(currentTokenText);
+        // System.err.println ("matchedText: '" + matchedText + "'");
+
+        AnnotationFS realToken = tokens.get(whichToken);
+        // System.err.println ("realToken: '" + realToken.getCoveredText
+        // () + ", count.intValue () = " + count.intValue ());
+
+        begin = (begin == -1) ? realToken.getBegin() : Math.min(begin, realToken.getBegin());
+        end = Math.max(end, realToken.getEnd());
+        matched.add(realToken);
+        // decrement count, or remove entry if none left
+        if (count.intValue() == 1) {
+          entryOccurences.remove(currentTokenText);
+        } else {
+          entryOccurences.put(currentTokenText, new Integer(count.intValue() - 1));
+        }
+      }
+
+      whichToken += 1;
+    }
+    if (entryOccurences.isEmpty()) {
+      // System.err.println ("makeAnnotation, text: " +
+      // matchedText.toString ());
+      makeAnnotation(tcas, begin, end, entry.getProperties(), spanAnnotation, matchedText
+              .toString(), matched, logger);
+    }
+    // else
+    // {
+    // System.err.println ("whichToken = " + whichToken + ",
+    // normalizedTokens.size = " + normalizedTokens.size ());
+    // }
+
+    return whichToken - startingPoint;
+  }
+
+  // generate a map from tokens to number of occurrences of that token
+  private TreeMap<String, Integer> findEntryOccurences(Collection<String> normalizedTokens,
+          int whichToken) {
+    TreeMap<String, Integer> result = new TreeMap<String, Integer>();
+
+    Iterator<String> iter = normalizedTokens.iterator();
+    while (iter.hasNext()) {
+      String token = iter.next();
+      Integer count = result.get(token);
+      if (count == null) {
+        count = new Integer(1);
+      } else {
+        count = new Integer(count.intValue() + 1);
+      }
+      result.put(token, count);
+
+    }
+    return result;
+  }
+
+  /**
+   * Find contiguous dictionary matches within the given span.
+   * 
+   * @param searchStrategy
+   * @param findAllMatches
+   *          if true, report every matching entry instead of stopping at the first
+   * @param tcas
+   * @param tokens
+   *          list of token annotations within the span
+   * @param spanAnnotation
+   *          the enclosing span being processed
+   */
+  protected void processTokenList(int searchStrategy, boolean findAllMatches, CAS tcas,
+          ArrayList<AnnotationFS> tokens, Annotation spanAnnotation) {
+    AnnotationFS token;
+    // iterate over vector of tokens
+
+    int whichToken = 0;
+    int entryLength = 0;
+
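+    // contiguous-match strategy: for each token, look up the dictionary entries indexed under its
+    // normalized text and try to match the longest possible window of contiguous tokens starting
+    // here (see defaultMatcher), then skip past whatever was matched before continuing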
+    while (whichToken < tokens.size()) {
+      token = tokens.get(whichToken);
+      String tokenText = getTokenText(token);
+      entryLength = 0;
+
+      String word = tokenNormalizer.normalize(tokenText);
+
+      // logger.logInfo("ENTRY SEARCH/ORIGINAL: " + word + " / " +
+      // tokenText);
+      // System.err.println("ENTRY SEARCH/ORIGINAL: " + word + ", Token["
+      // + whichToken + "]: " + tokenText);
+
+      DictionaryResource.DictEntriesByLength entriesByLength = dict.getEntries(word);
+      if (entriesByLength != null) {
+        entryLength = Math.min(entriesByLength.getLongest().intValue(),
+                (tokens.size() - whichToken));
+        // logger.logInfo("ENTRY FOUND for: " + word + ", longest: " +
+        // entryLength + ", shortest: " + minLength);
+        // System.err.println("ENTRY FOUND for: " + word + ", longest: "
+        // + entryLength + ", shortest: " + minLength);
+        // System.err.println("ENTRY FOUND for: " + word + ", longest: "
+        // + entryLength);
+
+        entryLength = defaultMatcher(findAllMatches, tcas, tokens, spanAnnotation, whichToken,
+                entryLength, token.getBegin(), entriesByLength, entriesByLength.getShortest()
+                        .intValue());
+
+      }
+      whichToken += entryLength + 1;
+    }
+  }
+
+  private int defaultMatcher(boolean findAllMatches, CAS tcas, ArrayList<AnnotationFS> tokens,
+          Annotation spanAnnotation, int whichToken, int entryLength, int start,
+          DictionaryResource.DictEntriesByLength lengthEntries, int minLength) {
+    boolean entryFound = false;
+    // search through all entry lengths, as necessary
+    while ((!entryFound) && (entryLength >= minLength)) {
+      String tokensToMatch = buildTokenString(tokens, whichToken, entryLength, sortElements);
+      // System.err.println(">>> tokensToMatch = " + tokensToMatch);
+      DictionaryResource.DictEntries entriesByLength = lengthEntries.getEntries(entryLength);
+      // System.err.println(">>> entriesByLength = " + entriesByLength);
+      if (entriesByLength != null) {
+        ArrayList<DictionaryResource.DictEntry> entries = entriesByLength.getEntries();
+        DictionaryResource.DictEntry dictEntry = findMatchingEntry(entries, tokensToMatch);
+        if (dictEntry != null) {
+          // System.err.println("===> MATCH: '" + tokensToMatch + "'");
+
+          AnnotationFS endToken = tokens.get(whichToken + entryLength - 1);
+          // System.err.println(">>>"+dictEntry.getUnsorted() );
+          makeAnnotation(tcas, start, endToken.getEnd(), dictEntry.getProperties(), spanAnnotation,
+                  dictEntry.getUnsorted(), tokens.subList(whichToken, whichToken + entryLength),
+                  logger);
+
+          updateTokenAnnotations(tokens, whichToken, entryLength, dictEntry);
+          if (!findAllMatches) {
+            entryFound = true;
+          }
+        }
+      }
+      entryLength--;
+    }
+    if (!entryFound) {
+      entryLength = 0;
+    }
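+    // when a match ends the search, entryLength has already been decremented, so the value
+    // returned is the matched length minus one and the caller's "whichToken += entryLength + 1"
+    // advances exactly past the matched tokens; 0 is returned when nothing matched, or when all
+    // matches were requested, so that the caller only advances by one token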
+    return entryLength;
+  }
+
+  /**
+   * Update the matched token annotations with the values stored in the dictionary entry for each
+   * of the features named by tokenClassWriteBackFeatureNames.
+   * 
+   * @param tokens
+   * @param whichToken
+   * @param entryLength
+   * @param dictEntry
+   */
+  private void updateTokenAnnotations(ArrayList<AnnotationFS> tokens, int whichToken,
+          int entryLength, DictEntry dictEntry) {
+    if (tokenClassWriteBackFeatures != null) {
+      for (int feature = 0; feature < tokenClassWriteBackFeatures.length; feature++) {
+        if (tokenClassWriteBackFeatures[feature] != null) {
+          String propVal = dictEntry.getProperties().getProperty(
+                  tokenClassWriteBackFeatureNames[feature], "unknown");
+          // System.err.println ("propVal: " + ": " + propVal);
+          for (int i = whichToken; i < whichToken + entryLength; i++) {
+            AnnotationFS tokenToUpdate = tokens.get(i);
+            // System.err.println ("Token: " + tokenToUpdate.getText
+            // ());
+            tokenToUpdate.setStringValue(tokenClassWriteBackFeatures[feature], propVal);
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * @param tcas
+   * @param start
+   * @param end
+   * @param properties
+   *          properties of the matching dictionary entry
+   * @param spanAnnotation
+   * @param matchedText
+   * @param matched
+   *          the token annotations that took part in the match
+   * @param log
+   */
+  protected void makeAnnotation(CAS tcas, int start, int end, Properties properties,
+          Annotation spanAnnotation, String matchedText, Collection<AnnotationFS> matched,
+          Logger log) {
+    AnnotationFS annotation = tcas.createAnnotation(resultAnnotationType, start, end);
+    if (resultEnclosingSpan != null) {
+      annotation.setFeatureValue(resultEnclosingSpan, spanAnnotation);
+    }
+
+    if (resultMatchedTextFeature != null) {
+      annotation.setStringValue(resultMatchedTextFeature, matchedText);
+    }
+
+    if (matchedTokensFeature != null) {
+      FSArray matchedTokens = new FSArray(getJCas(), matched.size());
+      FeatureStructure[] featureStructArray = new FeatureStructure[matched.size()];
+      matched.toArray(featureStructArray);
+      matchedTokens.copyFromArray(featureStructArray, 0, 0, featureStructArray.length);
+      annotation.setFeatureValue(matchedTokensFeature, matchedTokens);
+      /*
+       * FSArray tmp = (FSArray) annotation.getFeatureValue (matchedTokensFeature); FeatureStructure []
+       * tmpfs = tmp.toArray (); System.err.println ("FSArray: begin"); for (int i = 0; i <
+       * tmpfs.length; i++) { System.err.println (((Annotation) tmpfs[i]).getCoveredText ()); }
+       * System.err.println ("FSArray: done");
+       */
+    }
+
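+    // copy each configured dictionary attribute into its corresponding result feature, falling
+    // back to "unknown" when the entry does not define that property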
+    for (int featIndex = 0; featIndex < features.length; featIndex++) {
+      if (features[featIndex] != null) {
+        annotation.setStringValue(features[featIndex], properties.getProperty(
+                attributeNames[featIndex], "unknown"));
+      } else {
+        String message = "Feature '" + featureNames[featIndex] + "' not found in type '"
+                + resultAnnotationName + "'";
+        // System.err.println(message);
+
+        log.logWarning(message);
+      }
+    }
+
+    tcas.getIndexRepository().addFS(annotation);
+  }
+
+  /**
+   * @param entries
+   *          dictionary entries of the appropriate length
+   * @param tokensToMatch
+   *          the normalized (and possibly sorted) token string to look up
+   * @return the entry whose text equals tokensToMatch, or null if there is none
+   */
+  private DictEntry findMatchingEntry(ArrayList<DictionaryResource.DictEntry> entries,
+          String tokensToMatch) {
+    // System.err.println("Searching for: '" + tokensToMatch + "'");
+
+    for (int i = 0; i < entries.size(); i++) {
+      DictionaryResource.DictEntry dictEntry = entries.get(i);
+      String entryText = dictEntry.getText();
+
+      // System.err.println("--> trying: '" + entryText + "'");
+
+      if (entryText.equals(tokensToMatch)) {
+        return dictEntry;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * @param tokens
+   * @param startIndex
+   * @param length
+   * @param sortElements
+   *          if true, sort the normalized tokens alphabetically (order-independent lookup)
+   * @return the normalized tokens, joined by single spaces
+   */
+  private String buildTokenString(ArrayList<AnnotationFS> tokens, int startIndex, int length,
+          boolean sortElements) {
+    String[] elements = new String[length];
+    for (int i = startIndex; i < length + startIndex; i++) {
+      AnnotationFS token = tokens.get(i);
+      elements[i - startIndex] = tokenNormalizer.normalize(getTokenText(token));
+    }
+
+    if (sortElements) {
+      Arrays.sort(elements);
+    }
+
+    StringBuffer result = new StringBuffer();
+    for (int i = 0; i < elements.length; i++) {
+      if (result.length() != 0) {
+        result.append(" ");
+      }
+      result.append(elements[i]);
+    }
+    return result.toString();
+  }
+
+  private String getTokenText(AnnotationFS token) {
+    if (tokenTextFeature == null) {
+      return token.getCoveredText();
+    } else {
+      return token.getStringValue(tokenTextFeature);
+    }
+  }
+}

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,152 @@
+
+
+/* First created by JCasGen Sat Dec 22 14:05:15 EST 2007 */
+package org.apache.uima.conceptMapper;
+
+import org.apache.uima.jcas.JCas; 
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.jcas.cas.TOP_Type;
+
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.cas.FSArray;
+
+
+/** Annotation for dictionary lookup matches
+ * Updated by JCasGen Tue Mar 25 15:43:47 EDT 2008
+ * XML source: /OtherStuff/IBM/eclipse-apacheuima/conceptMapper/src/org/apache/uima/conceptMapper/DictTerm.xml
+ * @generated */
+public class DictTerm extends Annotation {
+  /** @generated
+   * @ordered 
+   */
+  public final static int typeIndexID = JCasRegistry.register(DictTerm.class);
+  /** @generated
+   * @ordered 
+   */
+  public final static int type = typeIndexID;
+  /** @generated  */
+  public              int getTypeIndexID() {return typeIndexID;}
+ 
+  /** Never called.  Disable default constructor
+   * @generated */
+  protected DictTerm() {}
+    
+  /** Internal - constructor used by generator 
+   * @generated */
+  public DictTerm(int addr, TOP_Type type) {
+    super(addr, type);
+    readObject();
+  }
+  
+  /** @generated */
+  public DictTerm(JCas jcas) {
+    super(jcas);
+    readObject();   
+  } 
+
+  /** @generated */  
+  public DictTerm(JCas jcas, int begin, int end) {
+    super(jcas);
+    setBegin(begin);
+    setEnd(end);
+    readObject();
+  }   
+
+  /** <!-- begin-user-doc -->
+    * Write your own initialization here
+    * <!-- end-user-doc -->
+  @generated modifiable */
+  private void readObject() {}
+     
+ 
+    
+  //*--------------*
+  //* Feature: DictCanon
+
+  /** getter for DictCanon - gets canonical form
+   * @generated */
+  public String getDictCanon() {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_DictCanon == null)
+      jcasType.jcas.throwFeatMissing("DictCanon", "org.apache.uima.conceptMapper.DictTerm");
+    return jcasType.ll_cas.ll_getStringValue(addr, ((DictTerm_Type)jcasType).casFeatCode_DictCanon);}
+    
+  /** setter for DictCanon - sets canonical form 
+   * @generated */
+  public void setDictCanon(String v) {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_DictCanon == null)
+      jcasType.jcas.throwFeatMissing("DictCanon", "org.apache.uima.conceptMapper.DictTerm");
+    jcasType.ll_cas.ll_setStringValue(addr, ((DictTerm_Type)jcasType).casFeatCode_DictCanon, v);}    
+   
+    
+  //*--------------*
+  //* Feature: enclosingSpan
+
+  /** getter for enclosingSpan - gets span that this DictTerm is contained within (i.e. its sentence)
+   * @generated */
+  public Annotation getEnclosingSpan() {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_enclosingSpan == null)
+      jcasType.jcas.throwFeatMissing("enclosingSpan", "org.apache.uima.conceptMapper.DictTerm");
+    return (Annotation)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_enclosingSpan)));}
+    
+  /** setter for enclosingSpan - sets span that this DictTerm is contained within (i.e. its sentence) 
+   * @generated */
+  public void setEnclosingSpan(Annotation v) {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_enclosingSpan == null)
+      jcasType.jcas.throwFeatMissing("enclosingSpan", "org.apache.uima.conceptMapper.DictTerm");
+    jcasType.ll_cas.ll_setRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_enclosingSpan, jcasType.ll_cas.ll_getFSRef(v));}    
+   
+    
+  //*--------------*
+  //* Feature: matchedText
+
+  /** getter for matchedText - gets 
+   * @generated */
+  public String getMatchedText() {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_matchedText == null)
+      jcasType.jcas.throwFeatMissing("matchedText", "org.apache.uima.conceptMapper.DictTerm");
+    return jcasType.ll_cas.ll_getStringValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedText);}
+    
+  /** setter for matchedText - sets  
+   * @generated */
+  public void setMatchedText(String v) {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_matchedText == null)
+      jcasType.jcas.throwFeatMissing("matchedText", "org.apache.uima.conceptMapper.DictTerm");
+    jcasType.ll_cas.ll_setStringValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedText, v);}    
+   
+    
+  //*--------------*
+  //* Feature: matchedTokens
+
+  /** getter for matchedTokens - gets 
+   * @generated */
+  public FSArray getMatchedTokens() {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_matchedTokens == null)
+      jcasType.jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    return (FSArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedTokens)));}
+    
+  /** setter for matchedTokens - sets  
+   * @generated */
+  public void setMatchedTokens(FSArray v) {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_matchedTokens == null)
+      jcasType.jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    jcasType.ll_cas.ll_setRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedTokens, jcasType.ll_cas.ll_getFSRef(v));}    
+    
+  /** indexed getter for matchedTokens - gets an indexed value - 
+   * @generated */
+  public TOP getMatchedTokens(int i) {
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_matchedTokens == null)
+      jcasType.jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedTokens), i);
+    return (TOP)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedTokens), i)));}
+
+  /** indexed setter for matchedTokens - sets an indexed value - 
+   * @generated */
+  public void setMatchedTokens(int i, TOP v) { 
+    if (DictTerm_Type.featOkTst && ((DictTerm_Type)jcasType).casFeat_matchedTokens == null)
+      jcasType.jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedTokens), i);
+    jcasType.ll_cas.ll_setRefArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((DictTerm_Type)jcasType).casFeatCode_matchedTokens), i, jcasType.ll_cas.ll_getFSRef(v));}
+  }
+
+    
\ No newline at end of file

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.xml?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.xml (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm.xml Mon Jun 23 05:31:40 2008
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one
+	or more contributor license agreements.  See the NOTICE file
+	distributed with this work for additional information
+	regarding copyright ownership.  The ASF licenses this file
+	to you under the Apache License, Version 2.0 (the
+	"License"); you may not use this file except in compliance
+	with the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing,
+	software distributed under the License is distributed on an
+	"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+	KIND, either express or implied.  See the License for the
+	specific language governing permissions and limitations
+	under the License.    
+-->
+<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
+<name>DictTerm</name>
+<version>1</version>
+<vendor></vendor>
+<types>
+<typeDescription>
+<name>org.apache.uima.conceptMapper.DictTerm</name>
+<description>Annotation for dictionary lookup matches</description>
+<supertypeName>uima.tcas.Annotation</supertypeName>
+<features>
+<featureDescription>
+<name>DictCanon</name>
+<description>canonical form</description>
+<rangeTypeName>uima.cas.String</rangeTypeName>
+</featureDescription>
+<featureDescription>
+<name>enclosingSpan</name>
+<description>span that this DictTerm is contained within (i.e. its sentence)</description>
+<rangeTypeName>uima.tcas.Annotation</rangeTypeName>
+</featureDescription>
+<featureDescription>
+<name>matchedText</name>
+<description></description>
+<rangeTypeName>uima.cas.String</rangeTypeName>
+</featureDescription>
+<featureDescription>
+<name>matchedTokens</name>
+<description></description>
+<rangeTypeName>uima.cas.FSArray</rangeTypeName>
+</featureDescription>
+</features>
+</typeDescription>
+</types>
+</typeSystemDescription>

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm_Type.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm_Type.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm_Type.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/DictTerm_Type.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,165 @@
+
+
+/* First created by JCasGen Tue Jul 19 09:44:40 EDT 2005 */
+package org.apache.uima.conceptMapper;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.cas.impl.CASImpl;
+import org.apache.uima.cas.impl.FSGenerator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.TypeImpl;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.FeatureImpl;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.tcas.Annotation_Type;
+
+/** Annotation for dictionary lookup matches
+ * Updated by JCasGen Tue Mar 25 15:43:47 EDT 2008
+ * @generated */
+public class DictTerm_Type extends Annotation_Type {
+  /** @generated */
+  protected FSGenerator getFSGenerator() {return fsGenerator;}
+  /** @generated */
+  private final FSGenerator fsGenerator = 
+    new FSGenerator() {
+      public FeatureStructure createFS(int addr, CASImpl cas) {
+  			 if (DictTerm_Type.this.useExistingInstance) {
+  			   // Return eq fs instance if already created
+  		     FeatureStructure fs = DictTerm_Type.this.jcas.getJfsFromCaddr(addr);
+  		     if (null == fs) {
+  		       fs = new DictTerm(addr, DictTerm_Type.this);
+  			   DictTerm_Type.this.jcas.putJfsFromCaddr(addr, fs);
+  			   return fs;
+  		     }
+  		     return fs;
+        } else return new DictTerm(addr, DictTerm_Type.this);
+  	  }
+    };
+  /** @generated */
+  public final static int typeIndexID = DictTerm.typeIndexID;
+  /** @generated 
+     @modifiable */
+  public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.conceptMapper.DictTerm");
+ 
+  /** @generated */
+  final Feature casFeat_DictCanon;
+  /** @generated */
+  final int     casFeatCode_DictCanon;
+  /** @generated */ 
+  public String getDictCanon(int addr) {
+        if (featOkTst && casFeat_DictCanon == null)
+      jcas.throwFeatMissing("DictCanon", "org.apache.uima.conceptMapper.DictTerm");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_DictCanon);
+  }
+  /** @generated */    
+  public void setDictCanon(int addr, String v) {
+        if (featOkTst && casFeat_DictCanon == null)
+      jcas.throwFeatMissing("DictCanon", "org.apache.uima.conceptMapper.DictTerm");
+    ll_cas.ll_setStringValue(addr, casFeatCode_DictCanon, v);}
+    
+  
+ 
+  /** @generated */
+  final Feature casFeat_enclosingSpan;
+  /** @generated */
+  final int     casFeatCode_enclosingSpan;
+  /** @generated */ 
+  public int getEnclosingSpan(int addr) {
+        if (featOkTst && casFeat_enclosingSpan == null)
+      jcas.throwFeatMissing("enclosingSpan", "org.apache.uima.conceptMapper.DictTerm");
+    return ll_cas.ll_getRefValue(addr, casFeatCode_enclosingSpan);
+  }
+  /** @generated */    
+  public void setEnclosingSpan(int addr, int v) {
+        if (featOkTst && casFeat_enclosingSpan == null)
+      jcas.throwFeatMissing("enclosingSpan", "org.apache.uima.conceptMapper.DictTerm");
+    ll_cas.ll_setRefValue(addr, casFeatCode_enclosingSpan, v);}
+    
+  
+ 
+  /** @generated */
+  final Feature casFeat_matchedText;
+  /** @generated */
+  final int     casFeatCode_matchedText;
+  /** @generated */ 
+  public String getMatchedText(int addr) {
+        if (featOkTst && casFeat_matchedText == null)
+      jcas.throwFeatMissing("matchedText", "org.apache.uima.conceptMapper.DictTerm");
+    return ll_cas.ll_getStringValue(addr, casFeatCode_matchedText);
+  }
+  /** @generated */    
+  public void setMatchedText(int addr, String v) {
+        if (featOkTst && casFeat_matchedText == null)
+      jcas.throwFeatMissing("matchedText", "org.apache.uima.conceptMapper.DictTerm");
+    ll_cas.ll_setStringValue(addr, casFeatCode_matchedText, v);}
+    
+  
+ 
+  /** @generated */
+  final Feature casFeat_matchedTokens;
+  /** @generated */
+  final int     casFeatCode_matchedTokens;
+  /** @generated */ 
+  public int getMatchedTokens(int addr) {
+    if (featOkTst && casFeat_matchedTokens == null)
+      jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    return ll_cas.ll_getRefValue(addr, casFeatCode_matchedTokens);
+  }
+  /** @generated */
+  public void setMatchedTokens(int addr, int v) {
+    if (featOkTst && casFeat_matchedTokens == null)
+      jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    ll_cas.ll_setRefValue(addr, casFeatCode_matchedTokens, v);
+  }
+
+  /** @generated */
+  public int getMatchedTokens(int addr, int i) {
+    if (featOkTst && casFeat_matchedTokens == null)
+      jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    if (lowLevelTypeChecks)
+      return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_matchedTokens), i, true);
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_matchedTokens), i);
+    return ll_cas.ll_getRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_matchedTokens), i);
+  }
+
+  /** @generated */
+  public void setMatchedTokens(int addr, int i, int v) {
+    if (featOkTst && casFeat_matchedTokens == null)
+      jcas.throwFeatMissing("matchedTokens", "org.apache.uima.conceptMapper.DictTerm");
+    if (lowLevelTypeChecks)
+      ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_matchedTokens), i, v, true);
+    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_matchedTokens), i);
+    ll_cas.ll_setRefArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_matchedTokens), i, v);
+  }
+ 
+
+
+
+  /** initialize variables to correspond with Cas Type and Features
+   * @generated */
+  public DictTerm_Type(JCas jcas, Type casType) {
+    super(jcas, casType);
+    casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+
+ 
+    casFeat_DictCanon = jcas.getRequiredFeatureDE(casType, "DictCanon", "uima.cas.String", featOkTst);
+    casFeatCode_DictCanon  = (null == casFeat_DictCanon) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_DictCanon).getCode();
+
+ 
+    casFeat_enclosingSpan = jcas.getRequiredFeatureDE(casType, "enclosingSpan", "uima.tcas.Annotation", featOkTst);
+    casFeatCode_enclosingSpan  = (null == casFeat_enclosingSpan) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_enclosingSpan).getCode();
+
+ 
+    casFeat_matchedText = jcas.getRequiredFeatureDE(casType, "matchedText", "uima.cas.String", featOkTst);
+    casFeatCode_matchedText  = (null == casFeat_matchedText) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_matchedText).getCode();
+
+ 
+    casFeat_matchedTokens = jcas.getRequiredFeatureDE(casType, "matchedTokens", "uima.cas.FSArray", featOkTst);
+    casFeatCode_matchedTokens  = (null == casFeat_matchedTokens) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_matchedTokens).getCode();
+
+  }
+}
+
+
+
+    
\ No newline at end of file
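
Usage note: DictTerm_Type is the JCasGen-generated _Type class that backs the DictTerm JCas
cover class; annotator code normally goes through the cover class rather than these
int-address accessors. A minimal sketch of that usage, assuming the companion JCasGen
DictTerm cover class with setters mirroring the features defined above (the example class
and helper method are illustrative only, not part of this commit):

    import org.apache.uima.jcas.JCas;
    import org.apache.uima.jcas.cas.FSArray;
    import org.apache.uima.jcas.tcas.Annotation;
    import org.apache.uima.conceptMapper.DictTerm;

    public class DictTermExample {
      static void addDictTerm(JCas jcas, Annotation firstToken, Annotation lastToken, String canon) {
        DictTerm term = new DictTerm(jcas, firstToken.getBegin(), lastToken.getEnd());
        term.setDictCanon(canon);                   // canonical form from the dictionary
        term.setMatchedText(term.getCoveredText()); // surface text that was matched
        FSArray tokens = new FSArray(jcas, 2);      // the token annotations that were matched
        tokens.set(0, firstToken);
        tokens.set(1, lastToken);
        term.setMatchedTokens(tokens);
        term.addToIndexes();
      }
    }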

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/Logger.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/Logger.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/Logger.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/Logger.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.conceptMapper;
+
+import org.apache.uima.util.Level;
+
+public class Logger {
+  private org.apache.uima.util.Logger logger;
+
+  private String prefix;
+
+  public Logger(String prefix, org.apache.uima.util.Logger logger) {
+    super();
+    this.prefix = prefix;
+    this.logger = logger;
+  }
+
+  public Logger(org.apache.uima.util.Logger logger) {
+    super();
+    this.prefix = "";
+    this.logger = logger;
+  }
+
+  private void log(Level level, String message) {
+    String logMessage = prefix + " " + level.toString() + ": " + message;
+
+    if (logger == null) {
+      System.err.println(logMessage);
+    } else {
+      logger.log(level, logMessage);
+    }
+  }
+
+  public void logError(String message) {
+    log(Level.SEVERE, message);
+  }
+
+  public void logWarning(String message) {
+    log(Level.WARNING, message);
+  }
+
+  public void logInfo(String message) {
+    log(Level.INFO, message);
+  }
+
+  public void logFine(String message) {
+    log(Level.FINE, message);
+  }
+
+  public void logFinest(String message) {
+    log(Level.FINEST, message);
+  }
+}
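
Usage note: this Logger is a thin wrapper that prepends a prefix and the severity level to
each message and falls back to System.err when no framework logger is supplied. A minimal
sketch of typical use (not part of this commit; class name and message text are illustrative):

    import org.apache.uima.UIMAFramework;
    import org.apache.uima.conceptMapper.Logger;

    public class LoggerExample {
      public static void main(String[] args) {
        // Wrap the UIMA framework logger; passing null instead would route output to System.err.
        Logger log = new Logger("ConceptMapper", UIMAFramework.getLogger(LoggerExample.class));
        log.logInfo("dictionary loaded");
        log.logWarning("no entries found for key 'example'");
      }
    }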

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/dictionaryCompiler/CompileDictionary.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/dictionaryCompiler/CompileDictionary.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/dictionaryCompiler/CompileDictionary.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/dictionaryCompiler/CompileDictionary.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.conceptMapper.dictionaryCompiler;
+
+import java.io.FileOutputStream;
+
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource_impl;
+import org.apache.uima.util.XMLInputSource;
+
+public class CompileDictionary {
+  private static final String DICTIONARY_RESOURCE_NAME = "/DictionaryFile";
+
+  public static void main(String[] args) throws Exception {
+    // args[0]: ConceptMapper analysis engine descriptor; producing the AE loads its
+    //          dictionary resource (looked up below under DICTIONARY_RESOURCE_NAME)
+    // args[1]: output file for the compiled (serialized) dictionary entries
+    AnalysisEngineDescription conceptMapperDesc = UIMAFramework.getXMLParser()
+            .parseAnalysisEngineDescription(new XMLInputSource(args[0]));
+    AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(conceptMapperDesc);
+    DictionaryResource_impl dict = (DictionaryResource_impl) ae.getResourceManager().getResource(
+            DICTIONARY_RESOURCE_NAME);
+
+    FileOutputStream output = new FileOutputStream(args[1]);
+    dict.serializeEntries(output);
+    output.close();
+    ae.destroy();
+    // The JVM does not always exit on its own here, most likely because some framework
+    // (CPM) threads are still alive, so exit explicitly.
+    System.exit(0);
+  }
+}
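
Usage note: the compiler instantiates the ConceptMapper analysis engine from the descriptor
named on the command line, which is expected to load and tokenize the dictionary resource,
and then serializes the loaded entries to the second argument. Invocation is along these
lines (both file names are placeholders; classpath details elided):

    java org.apache.uima.conceptMapper.dictionaryCompiler.CompileDictionary \
        path/to/ConceptMapperDescriptor.xml path/to/compiledDictionary.dict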

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/CompiledDictionaryResource_impl.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/CompiledDictionaryResource_impl.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/CompiledDictionaryResource_impl.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/CompiledDictionaryResource_impl.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.conceptMapper.support.dictionaryResource;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.Properties;
+
+import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
+import org.apache.uima.conceptMapper.Logger;
+import org.apache.uima.resource.DataResource;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.SharedResourceObject;
+/**
+ * Implementation of a UIMA DictionaryResource backed by a precompiled (serialized)
+ * dictionary, as produced by the ConceptMapper dictionary compiler.
+ */
+public class CompiledDictionaryResource_impl implements DictionaryResource, SharedResourceObject {
+  /**
+   * Hashtable of first words: maps the first word of every entry in the dictionary to the
+   * DictEntriesByLength object holding all entries that begin with that word.
+   */
+  protected Hashtable<String, DictEntriesByLength> dictImpl;
+
+  public DictionaryResource NewDictionaryResource(int initialSize) {
+    throw new UnsupportedOperationException();
+  }
+
+  public DictEntriesByLength getEntries(String key) {
+    return dictImpl.get(key);
+  }
+
+  public boolean isLoaded() {
+    return true;
+  }
+
+  public Enumeration<String> keys() {
+    return dictImpl.keys();
+  }
+
+  @SuppressWarnings("unchecked")
+  public void load(DataResource data) throws ResourceInitializationException {
+    ObjectInputStream ois = null;
+    try {
+      ois = new ObjectInputStream(data.getInputStream());
+      dictImpl = (Hashtable<String, DictEntriesByLength>) ois.readObject();
+    } catch (IOException e) {
+      throw new ResourceInitializationException(e);
+    } catch (ClassNotFoundException e) {
+      throw new ResourceInitializationException(e);
+    } finally {
+      if (ois != null) {
+        try {
+          ois.close();
+        } catch (IOException e) {
+          // ignore failure to close; the dictionary has already been read
+        }
+      }
+    }
+  }
+
+  public void loadDictionaryContents(AnnotatorContext context, Logger logger,
+          String tokenAnnotationName, String tokenTypeFeatureName, String tokenClassFeatureName,
+          String tokenizerDescriptor) throws ResourceInitializationException {
+    // nothing to do
+  }
+
+  public void putEntry(String key, String entry, ArrayList<String> tokens, String unsortedEntry,
+          int length, Properties props) {
+    throw new UnsupportedOperationException();
+  }
+
+}
\ No newline at end of file
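
Usage note: at run time an annotator obtains this object through its context, since the class
is registered as a shared external resource in the descriptor. A minimal sketch, assuming the
resource is bound under a key such as "DictionaryFile" (the key, the example class, and the
exception handling are illustrative, not part of this commit):

    import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
    import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
    import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
    import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource;

    public class DictionaryAccessExample {
      private DictionaryResource dict;

      void initDictionary(AnnotatorContext context) throws AnnotatorInitializationException {
        try {
          // Whether the compiled or the plain implementation is used is decided in the
          // descriptor; callers only depend on the DictionaryResource interface.
          dict = (DictionaryResource) context.getResourceObject("DictionaryFile");
        } catch (AnnotatorContextException e) {
          throw new AnnotatorInitializationException(e);
        }
      }
    }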

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryLoaderException.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryLoaderException.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryLoaderException.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryLoaderException.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.conceptMapper.support.dictionaryResource;
+
+public class DictionaryLoaderException extends Exception {
+
+  /**
+   * 
+   */
+  private static final long serialVersionUID = -8996670807380390034L;
+
+  public DictionaryLoaderException() {
+    super();
+  }
+
+  public DictionaryLoaderException(Throwable cause) {
+    super(cause);
+  }
+
+}

Added: incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource.java?rev=670541&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource.java (added)
+++ incubator/uima/sandbox/trunk/ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/dictionaryResource/DictionaryResource.java Mon Jun 23 05:31:40 2008
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.conceptMapper.support.dictionaryResource;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Enumeration;
+import java.util.Properties;
+
+import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
+import org.apache.uima.conceptMapper.Logger;
+import org.apache.uima.resource.DataResource;
+import org.apache.uima.resource.ResourceInitializationException;
+
+/**
+ * Interface for an external UIMA dictionary resource.
+ */
+public interface DictionaryResource {
+  // shared resource loader
+  public void load(DataResource data) throws ResourceInitializationException;
+
+  public DictionaryResource NewDictionaryResource(int initialSize);
+
+  public interface DictEntriesByLength extends Serializable {
+    public DictEntries getEntries(int length);
+
+    void putEntry(int length, String entry, ArrayList<String> elements, String unsorted,
+            Properties props);
+
+    public Integer getLongest();
+
+    public Integer getShortest();
+  }
+
+  public interface DictEntries extends Serializable {
+
+    /**
+     * @param entry
+     *          String representation of the tokens to be entered in the dictionary
+     * @param elements
+     *          the individual tokens of the entry
+     * @param unsorted
+     *          the original, unsorted form of the entry, or null if the entry was not sorted
+     * @param props
+     *          the properties object for the dictionary entry
+     */
+    void putEntry(String entry, Collection<String> elements, String unsorted, Properties props);
+
+    /**
+     * @return the list of dictionary entries
+     */
+    ArrayList<DictEntry> getEntries();
+
+    public String toString();
+  }
+
+  public interface DictEntry extends Serializable {
+    public String getText();
+
+    public void setElements(Collection<String> elements);
+
+    public Collection<String> getElements();
+
+    public void setText(String text);
+
+    public String getUnsorted();
+
+    public void setUnsorted(String text);
+
+    public Properties getProperties();
+
+    public void setProperties(Properties props);
+
+    public String toString();
+
+  }
+
+  /**
+   * Return the data structure containing the dictionary entries for the given key, indexed by
+   * number of tokens.
+   * 
+   * @param key
+   *          the first token of the entries to retrieve
+   * @return the entries for that key, grouped by entry length, or null if there are none
+   */
+  public DictEntriesByLength getEntries(String key);
+
+  /**
+   * @param key
+   *          the key to index on
+   * @param entry
+   *          String representation of tokens to be entered in the dictionary
+   * @param tokens
+   *          array of tokens to be entered in the dictionary
+   * @param unsortedEntry
+   *          String representation of the tokens in their original, unsorted order, if "entry"
+   *          has been sorted; otherwise null
+   * @param length
+   *          Number of tokens in entry
+   * @param props
+   *          the properties object for the dictionary entry
+   */
+  public void putEntry(String key, String entry, ArrayList<String> tokens, String unsortedEntry,
+          int length, Properties props);
+
+  /**
+   * @return an enumeration of all keys (first tokens) in the dictionary
+   */
+  public Enumeration<String> keys();
+
+  public String toString();
+
+  /**
+   * Load and tokenize the contents of the dictionary.
+   * 
+   * @param context
+   *          the annotator context
+   * @param logger
+   *          logger for progress and error reporting
+   * @param tokenAnnotationName
+   *          name of the token annotation type produced by the tokenizer
+   * @param tokenTypeFeatureName
+   *          name of the token type feature
+   * @param tokenClassFeatureName
+   *          name of the token class feature
+   * @param tokenizerDescriptor
+   *          descriptor of the analysis engine used to tokenize dictionary entries
+   * @throws ResourceInitializationException
+   */
+  public void loadDictionaryContents(AnnotatorContext context, Logger logger,
+          String tokenAnnotationName, String tokenTypeFeatureName, String tokenClassFeatureName,
+          String tokenizerDescriptor) throws ResourceInitializationException;
+
+  public boolean isLoaded();
+}
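
Usage note: the interface models the dictionary as a map from the first token of each entry to
the entries starting with that token, further grouped by entry length in tokens. A minimal
sketch that walks a loaded dictionary using only this interface (the dump method and output
format are illustrative, not part of this commit):

    import java.util.Enumeration;

    import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource;
    import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntries;
    import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntriesByLength;
    import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryResource.DictEntry;

    public class DictionaryDumpExample {
      static void dump(DictionaryResource dict) {
        for (Enumeration<String> keys = dict.keys(); keys.hasMoreElements();) {
          String firstToken = keys.nextElement();
          DictEntriesByLength byLength = dict.getEntries(firstToken);
          // Walk from the longest entries down to the shortest, as a longest-match lookup would.
          for (int length = byLength.getLongest(); length >= byLength.getShortest(); length--) {
            DictEntries entries = byLength.getEntries(length);
            if (entries == null) {
              continue; // no entries with exactly this many tokens
            }
            for (DictEntry entry : entries.getEntries()) {
              System.out.println(firstToken + " -> " + entry.getText());
            }
          }
        }
      }
    }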