You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2009/05/30 13:13:01 UTC
svn commit: r780236 [2/6] - in /incubator/uima/sandbox/trunk/Lucas: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/uima/ src/main/java/org/apache/uima/lucas/ src/main/java/org/apache/uima/lucas/co...

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java Sat May 30 11:12:58 2009
@@ -0,0 +1,574 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.text.Format;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.StringArray;
+import org.apache.uima.jcas.tcas.Annotation;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterators;
+
+/**
+ * 
+ * AnnotationTokenStream represents a TokenStream which extracts tokens from feature values of
+ * annotations of a given type from a JCas object. Each token has the start and end offset from the
+ * annotation object. This class supports only the following UIMA JCas types of features:
+ * <ol>
+ * <li>String</li>
+ * <li>StringArray</li>
+ * <li>FSArray</li>
+ * <li>Number types</li>
+ * </ol>
+ * 
+ * @author landefeld
+ * @version 0.2
+ */
+public class AnnotationTokenStream extends TokenStream {
+
+  /** View of the CAS, obtained in the constructors via the sofa name, that is read from. */
+  private JCas jCas;
+
+  /** Dot-separated path to the feature structures to read; null means the annotation itself. */
+  private String featurePath;
+
+  /** Base names of the features whose values become token text. */
+  private List<String> featureNames;
+
+  /** Delimiter used to join the feature values into one token; may be null. */
+  private String delimiter;
+
+  /** Iterates over the annotations of type {@link #annotationType}. */
+  private Iterator<Annotation> annotationIterator;
+
+  /** Iterates over the feature structures reached from the current annotation. */
+  private Iterator<FeatureStructure> featureStructureIterator;
+
+  /** Iterates over the token texts built from the current feature structure. */
+  private Iterator<String> featureValueIterator;
+
+  /** Annotation whose begin and end offsets are put on the tokens currently emitted. */
+  private Annotation currentAnnotation;
+
+  /** Type of the annotations that are iterated. */
+  private Type annotationType;
+
+  /** Optional format object for each feature, keyed by feature name. */
+  private Map<String, Format> featureFormats;
+
+  private static Logger logger = Logger.getLogger(AnnotationTokenStream.class);
+
+  /**
+   * Predicate that accepts every object except null; used to drop null entries from iterators.
+   * Declared static because it uses no state of the enclosing token stream, so instances do not
+   * need to keep a reference to it.
+   */
+  private static class NotNullPredicate<T> implements Predicate<T> {
+
+    public boolean apply(T object) {
+      return object != null;
+    }
+  }
+
+  /**
+   * Creates a TokenStream which emits the covered text of every annotation of a given type found
+   * in a view of a JCas object. Each token has the start and end offset of the annotation and
+   * takes the covered text as term text.
+   *
+   * @param cas
+   *          the JCas to read from
+   * @param sofaName
+   *          the name of the view whose annotations are used
+   * @param typeName
+   *          the name of the annotation type
+   * @throws CASException
+   *           declared by the view lookup on the given JCas
+   * @throws IllegalArgumentException
+   *           if the type lookup or the iterator initialization fails
+   */
+  public AnnotationTokenStream(JCas cas, String sofaName, String typeName) throws CASException {
+    // Delegate to the most general constructor instead of repeating its body: no feature path,
+    // no feature names (so the covered text is used), no delimiter and no formats.
+    this(cas, sofaName, typeName, null, Collections.<String> emptyList(), null, null);
+  }
+
+  /**
+   * Creates a TokenStream which extracts the values of a single feature from annotations of a
+   * given type found in a view of a JCas object. Each token has the start and end offset of the
+   * annotation and uses the feature value as term text.
+   *
+   * @param cas
+   *          the JCas to read from
+   * @param sofaName
+   *          the name of the view whose annotations are used
+   * @param typeName
+   *          the name of the annotation type
+   * @param featureName
+   *          the name of the feature from which the token text is built
+   * @param featureFormat
+   *          optional format object to convert feature values to strings; may be null
+   * @throws CASException
+   *           declared by the view lookup on the given JCas
+   * @throws IllegalArgumentException
+   *           if the type lookup or the iterator initialization fails
+   */
+  public AnnotationTokenStream(JCas cas, String sofaName, String typeName, String featureName,
+          Format featureFormat) throws CASException {
+    // Delegate to the most general constructor instead of repeating its body. The feature name
+    // list and the format map stay mutable collections, as they were before.
+    this(cas, sofaName, typeName, null,
+            new ArrayList<String>(Collections.singletonList(featureName)), null,
+            featureFormat == null ? null : new HashMap<String, Format>(Collections.singletonMap(
+                    featureName, featureFormat)));
+  }
+
+  /**
+   * Creates a TokenStream which extracts the values of the given features from annotations of a
+   * given type found in a view of a JCas object. Each token has the start and end offset of the
+   * annotation. If a delimiter is given, the feature values of an annotation are joined with it
+   * into a single token; otherwise each feature value becomes its own token.
+   *
+   * @param cas
+   *          the JCas to read from
+   * @param sofaName
+   *          the name of the view whose annotations are used
+   * @param typeName
+   *          the name of the annotation type
+   * @param featureNames
+   *          the names of the features from which the token text is built
+   * @param delimiter
+   *          a delimiter for concatenating the different feature values of an annotation; if
+   *          null, the values are not concatenated
+   * @param featureFormats
+   *          optional map of format objects to convert feature values to strings - the key must be
+   *          the feature name; may be null
+   * @throws CASException
+   *           declared by the view lookup on the given JCas
+   * @throws IllegalArgumentException
+   *           if the type lookup or the iterator initialization fails
+   */
+  public AnnotationTokenStream(JCas cas, String sofaName, String typeName,
+          List<String> featureNames, String delimiter, Map<String, Format> featureFormats)
+          throws CASException {
+    // Delegate to the most general constructor (no feature path) instead of repeating its body.
+    this(cas, sofaName, typeName, null, featureNames, delimiter, featureFormats);
+  }
+
+  /**
+   * Creates a TokenStream which extracts the values of the given features from annotations of a
+   * given type found in a view of a JCas object. Each token has the start and end offset of the
+   * annotation. No delimiter is set, so each feature value becomes its own token.
+   *
+   * @param cas
+   *          the JCas to read from
+   * @param sofaName
+   *          the name of the view whose annotations are used
+   * @param typeName
+   *          the name of the annotation type
+   * @param featureNames
+   *          the names of the features from which the token text is built
+   * @param featureFormats
+   *          optional map of format objects to convert feature values to strings - the key must be
+   *          the feature name; may be null
+   * @throws CASException
+   *           declared by the view lookup on the given JCas
+   * @throws IllegalArgumentException
+   *           if the type lookup or the iterator initialization fails
+   */
+  public AnnotationTokenStream(JCas cas, String sofaName, String typeName,
+          List<String> featureNames, Map<String, Format> featureFormats) throws CASException {
+    // Delegate to the most general constructor (no feature path, no delimiter) instead of
+    // repeating its body.
+    this(cas, sofaName, typeName, null, featureNames, null, featureFormats);
+  }
+
+  /**
+   * Creates a TokenStream which extracts the values of the given features from annotations of a
+   * given type found in a view of a JCas object. The addressed features are part of direct or
+   * indirect feature structure values of an annotation. For example an annotation of type person
+   * has a feature address whose values are address feature structures with features for the
+   * street, postal code and city. To create tokens with postal code and city of a person's
+   * address, the featurePath must be &quot;address&quot; and the featureNames
+   * &quot;postalCode&quot; and &quot;city&quot;. Each token has the start and end offset of the
+   * annotation. No delimiter is set, so each feature value becomes its own token.
+   *
+   * @param cas
+   *          the JCas to read from
+   * @param sofaName
+   *          the name of the view whose annotations are used
+   * @param typeName
+   *          the name of the annotation type
+   * @param featurePath
+   *          the path to the feature structures whose features should be used for tokens. Path
+   *          entries should be separated by &quot;.&quot;. Example:
+   *          &quot;affiliation.address.country&quot;
+   * @param featureNames
+   *          the names of the features from which the token text is built
+   * @param featureFormats
+   *          optional map of format objects to convert feature values to strings - the key must be
+   *          the feature name; may be null
+   * @throws CASException
+   *           declared by the view lookup on the given JCas
+   * @throws IllegalArgumentException
+   *           if the type lookup or the iterator initialization fails
+   */
+  public AnnotationTokenStream(JCas cas, String sofaName, String typeName, String featurePath,
+          List<String> featureNames, Map<String, Format> featureFormats) throws CASException {
+    // Delegate to the most general constructor (no delimiter) instead of repeating its body.
+    this(cas, sofaName, typeName, featurePath, featureNames, null, featureFormats);
+  }
+
+  /**
+   * Creates a TokenStream which extracts the values of the given features from annotations of a
+   * given type found in a view of a JCas object. The addressed features are part of direct or
+   * indirect feature structure values of an annotation. For example an annotation of type person
+   * has a feature address whose values are address feature structures with features for the
+   * street, postal code and city. To create tokens with postal code and city of a person's
+   * address, the featurePath must be &quot;address&quot; and the featureNames
+   * &quot;postalCode&quot; and &quot;city&quot;. Each token has the start and end offset of the
+   * annotation. If a delimiter is given, the feature values are joined with it into a single
+   * token; otherwise each feature value becomes its own token.
+   *
+   * @param cas
+   *          the JCas to read from
+   * @param sofaName
+   *          the name of the view whose annotations are used
+   * @param typeName
+   *          the name of the annotation type
+   * @param featurePath
+   *          the path to the feature structures whose features should be used for tokens. Path
+   *          entries should be separated by &quot;.&quot;. Example:
+   *          &quot;affiliation.address.country&quot;. If null, the features are read from the
+   *          annotation itself.
+   * @param featureNames
+   *          the names of the features from which the token text is built
+   * @param delimiter
+   *          a delimiter for concatenating the different feature values; if null, the values are
+   *          not concatenated
+   * @param featureFormats
+   *          optional map of format objects to convert feature values to strings - the key must be
+   *          the feature name; if null, an empty map is used
+   * @throws CASException
+   *           declared by the view lookup on the given JCas
+   * @throws IllegalArgumentException
+   *           if the type lookup or the iterator initialization fails; the original exception is
+   *           kept as cause
+   */
+  public AnnotationTokenStream(JCas cas, String sofaName, String typeName, String featurePath,
+          List<String> featureNames, String delimiter, Map<String, Format> featureFormats)
+          throws CASException {
+    super();
+    jCas = cas.getView(sofaName);
+    this.featurePath = featurePath;
+    this.featureNames = featureNames;
+    this.delimiter = delimiter;
+    if (featureFormats == null)
+      // typed factory method instead of the raw EMPTY_MAP constant
+      this.featureFormats = Collections.<String, Format> emptyMap();
+    else
+      this.featureFormats = featureFormats;
+
+    try {
+      annotationType = jCas.getTypeSystem().getType(typeName);
+      logger.debug(typeName + ", found: " + (annotationType != null));
+      logger.debug("featurePath: " + featurePath);
+      logger.debug("featureNames: " + featureNames);
+      initializeIterators();
+    } catch (Exception e) {
+      // name the type in the message and keep the original exception as cause
+      throw new IllegalArgumentException(e.getMessage() + " at type " + typeName, e);
+    }
+  }
+
+  /**
+   * Fills the given token with the next feature value and returns it. The token gets the begin and
+   * end offset of the annotation the value was read from.
+   *
+   * @param token
+   *          the token object to fill
+   * @return the filled token, or null when no annotation is left
+   * @throws IOException
+   *           wrapping anything thrown while the next value is determined
+   */
+  @Override
+  public Token next(Token token) throws IOException {
+    try {
+      // advance feature structure and annotation iterators until a value is available
+      while (!featureValueIterator.hasNext()) {
+        while (!featureStructureIterator.hasNext()) {
+          if (!annotationIterator.hasNext())
+            return null;
+          currentAnnotation = annotationIterator.next();
+          featureStructureIterator = createFeatureStructureIterator(currentAnnotation, featurePath);
+        }
+
+        featureValueIterator =
+                createFeatureValueIterator(featureStructureIterator.next(), featureNames);
+      }
+
+      token.setStartOffset(currentAnnotation.getBegin());
+      token.setEndOffset(currentAnnotation.getEnd());
+
+      char[] value = featureValueIterator.next().toCharArray();
+      token.setTermBuffer(value, 0, value.length);
+      return token;
+
+    } catch (Throwable e) {
+
+      IOException ioException =
+              new IOException(e + " at type " + annotationType.getName() + " features "
+                      + featureNames + " featurePath " + featurePath + " sofa "
+                      + jCas.getViewName(), e);
+      // pass the exception as throwable so that the stack trace and the cause are logged too
+      logger.error(ioException.getMessage(), ioException);
+      throw ioException;
+    }
+  }
+
+  /**
+   * Returns the next token, using a newly created token object as the one to be filled.
+   *
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  @Override
+  public Token next() throws IOException {
+    Token freshToken = new Token();
+    return next(freshToken);
+  }
+
+  /**
+   * Positions the three iterators on the first annotation, its first feature structure and the
+   * values of that feature structure. Levels for which nothing is available get an empty iterator.
+   */
+  protected void initializeIterators() {
+    annotationIterator =
+            Iterators.filter(jCas.getAnnotationIndex(annotationType).iterator(),
+                    new NotNullPredicate<Annotation>());
+
+    if (annotationIterator.hasNext()) {
+      currentAnnotation = (Annotation) annotationIterator.next();
+      featureStructureIterator = createFeatureStructureIterator(currentAnnotation, featurePath);
+
+      if (featureStructureIterator.hasNext()) {
+        FeatureStructure firstStructure = featureStructureIterator.next();
+        featureValueIterator = createFeatureValueIterator(firstStructure, featureNames);
+      } else {
+        // the first annotation has no feature structures to read values from
+        featureValueIterator = Iterators.emptyIterator();
+      }
+    } else {
+      // no annotation of the requested type at all
+      featureStructureIterator = Iterators.emptyIterator();
+      featureValueIterator = Iterators.emptyIterator();
+    }
+  }
+
+  /**
+   * Collects the feature structures that are reached from the given annotation by following the
+   * feature path and returns an iterator over them. Array valued features contribute every
+   * element of the array. Null feature values and null array elements are skipped at every step
+   * of the path, so that a later path entry is never resolved on a null feature structure.
+   *
+   * @param annotation
+   *          the annotation to start from
+   * @param featurePath
+   *          the path entries separated by &quot;.&quot;, or null to get the annotation itself
+   * @return an iterator over the feature structures at the end of the path
+   * @throws IllegalArgumentException
+   *           if a path entry does not name a feature of the type reached so far
+   */
+  protected Iterator<FeatureStructure> createFeatureStructureIterator(Annotation annotation,
+          String featurePath) {
+    List<FeatureStructure> featureStructures = new ArrayList<FeatureStructure>();
+    featureStructures.add(annotation);
+
+    if (featurePath == null)
+      return featureStructures.iterator();
+
+    Type currentType = annotation.getType();
+    if (currentType.isArray())
+      currentType = currentType.getComponentType();
+
+    for (String pathEntry : featurePath.split("\\.")) {
+      Feature feature = currentType.getFeatureByBaseName(pathEntry);
+      if (feature == null)
+        throw new IllegalArgumentException("no feature '" + pathEntry + "' in type "
+                + currentType.getName() + " (feature path '" + featurePath + "')");
+
+      boolean arrayValued = feature.getRange().isArray();
+      List<FeatureStructure> children = new ArrayList<FeatureStructure>();
+
+      for (FeatureStructure parent : featureStructures) {
+        FeatureStructure value = parent.getFeatureValue(feature);
+        if (value == null)
+          continue;
+
+        if (arrayValued) {
+          FSArray fsArray = (FSArray) value;
+          for (int i = 0; i < fsArray.size(); i++) {
+            FeatureStructure element = fsArray.get(i);
+            if (element != null)
+              children.add(element);
+          }
+        } else
+          children.add(value);
+      }
+
+      // the next path entry is looked up in the (component) type of this feature's range
+      currentType = feature.getRange();
+      if (currentType.isArray())
+        currentType = currentType.getComponentType();
+
+      featureStructures = children;
+    }
+
+    return featureStructures.iterator();
+  }
+
+  /**
+   * Builds the token texts for one feature structure and returns an iterator over them.
+   * <ul>
+   * <li>Without feature names the covered text of the current annotation is the only value.</li>
+   * <li>A non-array feature contributes its (optionally formatted) value.</li>
+   * <li>An array feature is read as StringArray. If it is the only feature, every element is a
+   * value of its own; otherwise the elements are joined into one value, using the delimiter or a
+   * white space if no delimiter is set. A missing array contributes nothing.</li>
+   * <li>If a delimiter is set, all values are finally joined with it into a single value.</li>
+   * </ul>
+   * Null values are never returned and never cause a dangling delimiter.
+   *
+   * @param srcFeatureStructure
+   *          the feature structure to read the features from
+   * @param featureNames
+   *          the base names of the features to read
+   * @return an iterator over the non-null token texts
+   * @throws IllegalArgumentException
+   *           if a feature name does not exist in the type of the feature structure
+   */
+  protected Iterator<String> createFeatureValueIterator(FeatureStructure srcFeatureStructure,
+          Collection<String> featureNames) {
+    List<String> values = new ArrayList<String>();
+    Type featureType = srcFeatureStructure.getType();
+
+    if (featureNames.isEmpty())
+      values.add(currentAnnotation.getCoveredText());
+
+    for (String featureName : featureNames) {
+      Feature feature = featureType.getFeatureByBaseName(featureName);
+      if (feature == null)
+        throw new IllegalArgumentException("no feature '" + featureName + "' in type "
+                + featureType.getName());
+
+      if (!feature.getRange().isArray()) {
+        values.add(getValueForFeature(srcFeatureStructure, feature, featureFormats.get(feature
+                .getShortName())));
+        continue;
+      }
+
+      StringArray stringArray = (StringArray) srcFeatureStructure.getFeatureValue(feature);
+      if (stringArray == null)
+        continue; // feature not set: nothing to add
+
+      List<String> elements = new ArrayList<String>(stringArray.size());
+      for (int i = 0; i < stringArray.size(); i++)
+        elements.add(stringArray.get(i));
+
+      if (featureNames.size() == 1)
+        values.addAll(elements);
+      else
+        // with several features the array is collapsed into one value; fall back to a white
+        // space because concatenating a null delimiter would fail
+        values.add(joinNonNull(elements, delimiter != null ? delimiter : " "));
+    }
+
+    if (delimiter != null) {
+      // as before, a single (possibly empty) joined value is produced when a delimiter is set
+      String joined = joinNonNull(values, delimiter);
+      values = new ArrayList<String>(1);
+      values.add(joined);
+    }
+
+    return Iterators.filter(values.iterator(), new NotNullPredicate<String>());
+  }
+
+  /** Joins the non-null entries of the given list, putting the separator between them. */
+  private static String joinNonNull(List<String> parts, String separator) {
+    StringBuilder joined = new StringBuilder();
+    boolean first = true;
+    for (String part : parts) {
+      if (part == null)
+        continue;
+      if (!first)
+        joined.append(separator);
+      joined.append(part);
+      first = false;
+    }
+    return joined.toString();
+  }
+
+  /**
+   * Returns the value of a feature as string. Without a format the string representation provided
+   * by the feature structure is returned. With a format, values of the range types double, float,
+   * long, integer and short are read as numbers and converted by the format. For every other
+   * range type there is no number the format could be applied to, so the plain string
+   * representation is returned instead of handing null to the format.
+   *
+   * @param featureStructure
+   *          the feature structure holding the value
+   * @param feature
+   *          the feature to read
+   * @param format
+   *          the format to apply to number values; may be null
+   * @return the feature value as string
+   */
+  public String getValueForFeature(FeatureStructure featureStructure, Feature feature, Format format) {
+    if (format == null)
+      return featureStructure.getFeatureValueAsString(feature);
+
+    String rangeName = feature.getRange().getName();
+    Object value;
+    if (CAS.TYPE_NAME_DOUBLE.equals(rangeName))
+      value = featureStructure.getDoubleValue(feature);
+    else if (CAS.TYPE_NAME_FLOAT.equals(rangeName))
+      value = featureStructure.getFloatValue(feature);
+    else if (CAS.TYPE_NAME_LONG.equals(rangeName))
+      value = featureStructure.getLongValue(feature);
+    else if (CAS.TYPE_NAME_INTEGER.equals(rangeName))
+      value = featureStructure.getIntValue(feature);
+    else if (CAS.TYPE_NAME_SHORT.equals(rangeName))
+      value = featureStructure.getShortValue(feature);
+    else
+      return featureStructure.getFeatureValueAsString(feature);
+
+    return format.format(value);
+  }
+
+  /**
+   * Rewinds this stream to the first annotation by initializing the iterators again. The
+   * configuration given to the constructor, including the feature formats, is kept, so the stream
+   * produces the same tokens again after a reset.
+   */
+  @Override
+  public void reset() {
+    featureStructureIterator = null;
+    currentAnnotation = null;
+    initializeIterators();
+  }
+
+  /** @return the map of format objects used to convert feature values, keyed by feature name */
+  public Map<String, Format> getFeatureFormats() {
+    return featureFormats;
+  }
+
+  /** @return the view of the CAS this stream reads from */
+  public JCas getJCas() {
+    return jCas;
+  }
+
+  /** @return the feature path this stream was configured with; may be null */
+  public String getFeaturePath() {
+    return featurePath;
+  }
+
+  /** @return the names of the features whose values are used as token text */
+  public List<String> getFeatureNames() {
+    return featureNames;
+  }
+
+  /** @return the delimiter used to join feature values; may be null */
+  public String getDelimiter() {
+    return delimiter;
+  }
+
+  /** @return the type of the annotations this stream iterates over */
+  public Type getAnnotationType() {
+    return annotationType;
+  }
+
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java Sat May 30 11:12:58 2009
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.CharTokenizer;
+
+/**
+ * CharTokenizer which treats every character except one configured delimiter character as part
+ * of a token.
+ */
+public class DelimiterTokenizer extends CharTokenizer {
+
+  /** The only character that does not belong to a token. */
+  private char delimiter;
+
+  /**
+   * @param arg0
+   *          the reader handed to the CharTokenizer
+   * @param delimiter
+   *          the character which is not part of any token
+   */
+  public DelimiterTokenizer(Reader arg0, char delimiter) {
+    super(arg0);
+    this.delimiter = delimiter;
+  }
+
+  @Override
+  protected boolean isTokenChar(char c) {
+    boolean isDelimiter = (c == delimiter);
+    return !isDelimiter;
+  }
+
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * TokenFilter which passes the tokens of its input through and, for every token whose text is a
+ * key of the hypernym map, emits one extra token per hypernym directly after it. The extra tokens
+ * carry the offsets of the input token and a position increment of 0.
+ */
+public class HypernymTokenFilter extends TokenFilter {
+
+  private static Logger logger = Logger.getLogger(HypernymTokenFilter.class);
+
+  /** key: token text, value: list of hypernyms */
+  private Map<String, List<String>> hypernyms;
+
+  private TokenStream tokenStream;
+
+  /** index of the next hypernym to emit, or -1 while no hypernyms are pending */
+  private int currentHypernymIndex;
+
+  /** hypernyms of the input token read last, or null */
+  private List<String> currentHypernyms;
+
+  /** the input token read last */
+  private Token inputToken;
+
+  /**
+   * Constructor.
+   *
+   * @param input
+   *          the input TokenStream
+   * @param hypernyms
+   *          the hypernym map. key: token text, value: list of hypernyms
+   */
+  public HypernymTokenFilter(TokenStream input, Map<String, List<String>> hypernyms) {
+    super(input);
+    this.tokenStream = input;
+    this.hypernyms = hypernyms;
+    this.currentHypernymIndex = -1;
+  }
+
+  /**
+   * Returns the next pending hypernym token if there is one, otherwise the next token of the
+   * input (null when the input returns null).
+   */
+  @Override
+  public Token next() throws IOException {
+    boolean hypernymsPending = currentHypernymIndex >= 0;
+
+    if (hypernymsPending && currentHypernymIndex < currentHypernyms.size()) {
+      String hypernym = currentHypernyms.get(currentHypernymIndex);
+      Token hypernymToken = new Token(hypernym, inputToken.startOffset(), inputToken.endOffset());
+      hypernymToken.setPositionIncrement(0);
+      logger.debug("adding hypernym " + hypernymToken.termText() + " for :"
+              + inputToken.termText());
+      currentHypernymIndex++;
+      return hypernymToken;
+    }
+
+    if (hypernymsPending && currentHypernymIndex == currentHypernyms.size()) {
+      // all hypernyms of the last input token have been emitted
+      currentHypernymIndex = -1;
+      currentHypernyms = null;
+    }
+
+    inputToken = tokenStream.next();
+    if (inputToken == null) {
+      return null;
+    }
+
+    currentHypernyms = hypernyms.get(inputToken.termText());
+    if (currentHypernyms != null) {
+      currentHypernymIndex = 0;
+    }
+    return inputToken;
+  }
+
+  /** Forgets the last input token and any pending hypernyms, then resets the input stream. */
+  @Override
+  public void reset() throws IOException {
+    inputToken = null;
+    currentHypernymIndex = -1;
+    currentHypernyms = null;
+
+    tokenStream.reset();
+  }
+
+  public Map<String, List<String>> getHypernyms() {
+    return hypernyms;
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * TokenFilter which reduces its input to a single token chosen by position: with
+ * {@link #FIRST_POSITION} only the first token of the input is returned, with
+ * {@link #LAST_POSITION} the input is read to its end and only the last token is returned. With
+ * any other position value the tokens of the input are passed through unchanged.
+ */
+public class PositionFilter extends TokenFilter {
+
+  public final static Integer FIRST_POSITION = 0;
+
+  public final static Integer LAST_POSITION = 1;
+
+  private TokenStream input;
+
+  /** the first token already returned in FIRST_POSITION mode, or null if none was returned yet */
+  private Token token;
+
+  private Integer position;
+
+  /**
+   * @param input
+   *          the input TokenStream
+   * @param position
+   *          {@link #FIRST_POSITION} or {@link #LAST_POSITION}
+   */
+  public PositionFilter(TokenStream input, Integer position) {
+    super(input);
+    this.input = input;
+    this.position = position;
+  }
+
+  @Override
+  public Token next() throws IOException {
+    Token newToken = input.next();
+
+    if (position.equals(FIRST_POSITION)) {
+      if (token != null)
+        return null; // the first token has been returned already
+      else {
+        token = newToken;
+        return newToken;
+      }
+    } else if (position.equals(LAST_POSITION)) {
+      // read the input to its end and keep only the token seen last
+      Token lastToken = null;
+      while (newToken != null) {
+        lastToken = newToken;
+        newToken = input.next();
+      }
+      return lastToken;
+    } else
+      return newToken;
+  }
+
+  /**
+   * Resets the filter as the superclass does and additionally forgets the remembered first token.
+   * Without this a filter in FIRST_POSITION mode would return nothing at all after a reset.
+   */
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    token = null;
+  }
+
+  public Integer getPosition() {
+    return position;
+  }
+
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A ReplaceFilter exchanges the term text of a token for the value the mapping holds for that
+ * text. Tokens whose text has no entry in the mapping pass through unchanged.
+ */
+public class ReplaceFilter extends TokenFilter {
+
+  private TokenStream input;
+
+  private Map<String, String> mapping;
+
+  /**
+   * @param input the token stream to filter
+   * @param mapping maps original term texts to their replacements
+   */
+  public ReplaceFilter(TokenStream input, Map<String, String> mapping) {
+    super(input);
+    this.mapping = mapping;
+    this.input = input;
+  }
+
+  /**
+   * @return the next input token, with its term text replaced if the mapping has an entry for it,
+   *         or null at the end of the input
+   */
+  @Override
+  public Token next() throws IOException {
+    Token current = input.next();
+    if (current != null) {
+      String original = new String(current.termBuffer(), 0, current.termLength());
+      String replacement = mapping.get(original);
+      if (replacement != null) {
+        char[] replacementChars = replacement.toCharArray();
+        current.setTermBuffer(replacementChars, 0, replacement.length());
+      }
+    }
+    return current;
+  }
+
+  public Map<String, String> getMapping() {
+    return mapping;
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import com.google.common.collect.Iterators;
+
+/**
+ * A SplitterFilter splits the term text of every input token at a delimiter and returns one token
+ * per resulting part. All parts carry the start and end offset of the token they were split from.
+ */
+public class SplitterFilter extends TokenFilter {
+
+  private String splitString;
+
+  // parts of the current input token that have not been returned yet
+  private Iterator<String> splitIterator;
+
+  // the input token the pending parts were split from
+  private Token currentToken;
+
+  /**
+   * @param input the token stream to filter
+   * @param splitString the delimiter; it is passed to {@link String#split(String)} and is
+   *        therefore interpreted as a regular expression
+   */
+  public SplitterFilter(TokenStream input, String splitString) {
+    super(input);
+    this.splitString = splitString;
+    this.splitIterator = Iterators.emptyIterator();
+  }
+
+  /**
+   * Returns the next part of the current input token, fetching further input tokens as needed.
+   * 
+   * @param token the token instance that is filled and returned
+   * @return the given token holding the next part, or null at the end of the input
+   * @throws IOException if the input stream fails
+   */
+  @Override
+  public Token next(Token token) throws IOException {
+
+    // Fetch input tokens until one yields at least one part. A term that consists of delimiters
+    // only splits into zero parts; it is skipped instead of ending the whole stream.
+    while (!splitIterator.hasNext()) {
+      currentToken = input.next(token);
+      if (currentToken == null)
+        return null;
+
+      String tokenText = new String(currentToken.termBuffer(), 0, currentToken.termLength());
+      String[] splitts = tokenText.split(splitString);
+      splitIterator = Iterators.forArray(splitts, 0, splitts.length);
+    }
+
+    token.setStartOffset(currentToken.startOffset());
+    token.setEndOffset(currentToken.endOffset());
+    char[] termBuffer = splitIterator.next().toCharArray();
+    token.setTermBuffer(termBuffer, 0, termBuffer.length);
+
+    return token;
+  }
+
+  @Override
+  public Token next() throws IOException {
+    return next(new Token());
+  }
+
+  public String getSplitString() {
+    return splitString;
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java Sat May 30 11:12:58 2009
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenStreamConcatenator takes a {@link java.util.Collection Collection} of
+ * {@link org.apache.lucene.analysis.TokenStream token streams} and returns their tokens one
+ * stream after the other, in the iteration order of the collection.
+ */
+public class TokenStreamConcatenator extends TokenStream {
+
+  private Collection<TokenStream> tokenStreams;
+
+  private Iterator<TokenStream> tokenStreamIterator;
+
+  private TokenStream currentTokenStream;
+
+  /**
+   * @param tokenStreams the token streams to concatenate
+   */
+  public TokenStreamConcatenator(Collection<TokenStream> tokenStreams) {
+    super();
+    this.tokenStreams = tokenStreams;
+    rewind();
+  }
+
+  /**
+   * @return the next token of the current stream, moving on to the following streams when the
+   *         current one is exhausted; null if no stream has a token left
+   */
+  @Override
+  public Token next() throws IOException {
+    if (currentTokenStream == null) {
+      if (!tokenStreamIterator.hasNext())
+        return null;
+      currentTokenStream = tokenStreamIterator.next();
+    }
+
+    Token candidate = currentTokenStream.next();
+    while (candidate == null && tokenStreamIterator.hasNext()) {
+      // current stream is exhausted, continue with the following one
+      currentTokenStream = tokenStreamIterator.next();
+      candidate = currentTokenStream.next();
+    }
+    return candidate;
+  }
+
+  /**
+   * Resets all token streams and starts over with the first one.
+   */
+  public void reset() throws IOException {
+    for (TokenStream stream : tokenStreams)
+      stream.reset();
+
+    rewind();
+  }
+
+  // positions the concatenator on the first token stream, or on none if there are no streams
+  private void rewind() {
+    tokenStreamIterator = tokenStreams.iterator();
+    currentTokenStream = tokenStreamIterator.hasNext() ? tokenStreamIterator.next() : null;
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java Sat May 30 11:12:58 2009
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenStreamMerger merges a {@link java.util.Collection collection} of
+ * {@link org.apache.lucene.analysis.TokenStream token streams} by the means of their token offsets.
+ * Adapts positionIncrement of tokens if their startOffset is exactly the same.
+ */
+public class TokenStreamMerger extends TokenStream {
+
+  /**
+   * Sorts tokens by descending start offset, so that the token with the smallest start offset
+   * becomes the last element of the sorted stack and is popped first.
+   */
+  private static class TokenComparator implements Comparator<Token> {
+
+    public int compare(Token token1, Token token2) {
+      int offset1 = token1.startOffset();
+      int offset2 = token2.startOffset();
+      // explicit comparison instead of a subtraction, which could overflow
+      return offset2 < offset1 ? -1 : (offset2 > offset1 ? 1 : 0);
+    }
+  }
+
+  private Collection<TokenStream> streams;
+
+  // start offset of the token returned last, -1 before the first token
+  private int currentOffset;
+
+  private TokenComparator comparator;
+
+  // The pending (read but not yet returned) token of every stream that has one. Keyed by stream
+  // rather than by token: tokens with equal content coming from different streams would collide
+  // as map keys and one of them would get lost.
+  // NOTE(review): assumes TokenStream instances use identity equals/hashCode - confirm for the
+  // Lucene version in use.
+  private Map<TokenStream, Token> currentTokens;
+
+  // the pending tokens, sorted so that the next token to return is on top
+  private Stack<Token> sortedTokens;
+
+  private boolean initialized;
+
+  /**
+   * @param streams the token streams to merge
+   * @throws IOException declared for compatibility; this constructor does not read the streams
+   */
+  public TokenStreamMerger(Collection<TokenStream> streams) throws IOException {
+    super();
+    this.streams = streams;
+    this.comparator = new TokenComparator();
+    currentTokens = new LinkedHashMap<TokenStream, Token>();
+    currentOffset = -1;
+    sortedTokens = new Stack<Token>();
+
+  }
+
+  // reads the first token of every stream; deferred until the first call of next()
+  private void init() throws IOException {
+    rebuildSortedTokens();
+    initialized = true;
+  }
+
+  /**
+   * Resets all merged streams and the state of the merger.
+   */
+  public void reset() throws IOException {
+    currentTokens.clear();
+    for (TokenStream stream : streams)
+      stream.reset();
+
+    currentOffset = -1;
+    sortedTokens.clear();
+    initialized = false;
+  }
+
+  /**
+   * Returns the pending token with the smallest start offset. Its position increment is set to 0
+   * if it starts at the same offset as the token returned before, otherwise to 1.
+   * 
+   * @return the next token, or null if all streams are exhausted
+   * @throws IOException if one of the streams fails
+   */
+  @Override
+  public Token next() throws IOException {
+    if (!initialized)
+      init();
+
+    if (sortedTokens.size() == 0)
+      return null;
+
+    Token currentToken = sortedTokens.pop();
+    removePendingToken(currentToken);
+    rebuildSortedTokens();
+
+    if (currentToken.startOffset() == currentOffset)
+      currentToken.setPositionIncrement(0);
+    else
+      currentToken.setPositionIncrement(1);
+
+    currentOffset = currentToken.startOffset();
+
+    return currentToken;
+  }
+
+  // removes the entry of the stream whose pending token is exactly the given instance
+  private void removePendingToken(Token emitted) {
+    TokenStream source = null;
+    for (Map.Entry<TokenStream, Token> pending : currentTokens.entrySet())
+      if (pending.getValue() == emitted) {
+        source = pending.getKey();
+        break;
+      }
+
+    if (source != null)
+      currentTokens.remove(source);
+  }
+
+  // reads a new token from every stream without a pending token and re-sorts the pending tokens
+  private void rebuildSortedTokens() throws IOException {
+    for (TokenStream stream : streams)
+      if (!currentTokens.containsKey(stream)) {
+        Token token = stream.next();
+        if (token != null)
+          currentTokens.put(stream, token);
+      }
+
+    sortedTokens.clear();
+    sortedTokens.addAll(currentTokens.values());
+    Collections.sort(sortedTokens, comparator);
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A UniqueFilter lets only the first {@link org.apache.lucene.analysis.Token token} with a given
+ * token text pass and removes all later tokens with the same text.
+ */
+public class UniqueFilter extends TokenFilter {
+
+  private TokenStream input;
+
+  // the token texts that were already returned
+  private Collection<String> termTexts;
+
+  private Token currentToken;
+
+  /**
+   * @param input the token stream to filter
+   */
+  public UniqueFilter(TokenStream input) {
+    super(input);
+    this.input = input;
+    termTexts = new HashSet<String>();
+  }
+
+  /**
+   * @return the next input token whose text was not returned before, or null at the end of the
+   *         input
+   */
+  @Override
+  public Token next() throws IOException {
+    for (currentToken = input.next(); currentToken != null; currentToken = input.next()) {
+      String text = new String(currentToken.termBuffer(), 0, currentToken.termLength());
+      // add() reports whether the text is new; already known texts are skipped
+      if (termTexts.add(text))
+        return currentToken;
+    }
+    return null;
+  }
+
+  /**
+   * Resets the input and forgets all token texts seen so far.
+   */
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    termTexts.clear();
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * An UpperCaseTokenFilter converts the term text of every token to upper case.
+ */
+public class UpperCaseTokenFilter extends TokenFilter {
+
+  private TokenStream input;
+
+  /**
+   * @param input the token stream to filter
+   */
+  public UpperCaseTokenFilter(TokenStream input) {
+    super(input);
+    this.input = input;
+  }
+
+  /**
+   * @return the next input token with its term text in upper case, or null at the end of the
+   *         input
+   */
+  @Override
+  public Token next() throws IOException {
+    Token nextToken = input.next();
+    if (nextToken == null)
+      return null;
+
+    String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
+    // Fixed locale: with the default locale the resulting terms depend on the JVM settings
+    // (e.g. "i" becomes a dotted capital I under a Turkish locale).
+    termText = termText.toUpperCase(java.util.Locale.ENGLISH);
+
+    nextToken.setTermBuffer(termText.toCharArray(), 0, termText.length());
+
+    return nextToken;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java Sat May 30 11:12:58 2009
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A MapFileReader reads a mapping from a reader that provides one <code>key=value</code> entry
+ * per line.
+ */
+public class MapFileReader extends Reader {
+
+  private BufferedReader reader;
+
+  /**
+   * @param reader the reader providing the mapping lines
+   */
+  public MapFileReader(BufferedReader reader) {
+    super();
+    this.reader = reader;
+  }
+
+  /**
+   * Reads all remaining lines and returns them as a map. Each line is split at its first
+   * <code>=</code>, so values may contain further <code>=</code> characters and may be empty.
+   * Blank lines are skipped. If a key occurs more than once, the last entry wins.
+   * 
+   * @return the mapping from keys to values
+   * @throws IOException if reading fails or a non-blank line contains no <code>=</code>
+   */
+  public Map<String, String> readMap() throws IOException {
+    Map<String, String> mapping = new HashMap<String, String>();
+
+    String line = reader.readLine();
+    while (line != null) {
+      // blank lines carry no entry
+      if (line.trim().length() > 0) {
+        // limit 2: split at the first '=' only and keep an empty value ("key=")
+        String[] keyValue = line.split("=", 2);
+        if (keyValue.length < 2)
+          throw new IOException("Malformed mapping line (expected key=value): " + line);
+
+        mapping.put(keyValue[0], keyValue[1]);
+      }
+      line = reader.readLine();
+    }
+
+    return mapping;
+  }
+
+  @Override
+  public void close() throws IOException {
+    reader.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return reader.read(cbuf, off, len);
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java Sat May 30 11:12:58 2009
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A MultimapFileReader reads a mapping from terms to lists of values from a reader that provides
+ * one <code>term=value1|value2|...</code> entry per line.
+ */
+public class MultimapFileReader extends Reader {
+
+  private BufferedReader reader;
+
+  /**
+   * @param reader the reader providing the mapping lines
+   */
+  public MultimapFileReader(BufferedReader reader) {
+    super();
+    this.reader = reader;
+  }
+
+  /**
+   * Reads all remaining lines and returns them as a multimap. Each line is split at its first
+   * <code>=</code>; the part behind it is split at every <code>|</code>. Blank lines are skipped.
+   * If a term occurs more than once, the last entry wins.
+   * 
+   * @return the mapping from terms to their values
+   * @throws IOException if reading fails or a non-blank line contains no <code>=</code>
+   */
+  public Map<String, List<String>> readMultimap() throws IOException {
+    Map<String, List<String>> multimap = new HashMap<String, List<String>>();
+
+    String line = reader.readLine();
+    while (line != null) {
+      // blank lines carry no entry
+      if (line.trim().length() > 0) {
+        // limit 2: split at the first '=' only, so values may contain '=' themselves
+        String[] keyValue = line.split("=", 2);
+        if (keyValue.length < 2)
+          throw new IOException("Malformed multimap line (expected term=value1|value2): " + line);
+
+        String term = keyValue[0];
+        String[] values = keyValue[1].split("\\|");
+        List<String> valueList = new ArrayList<String>(values.length);
+        for (String value : values)
+          valueList.add(value);
+
+        multimap.put(term, valueList);
+      }
+      line = reader.readLine();
+    }
+    return multimap;
+  }
+
+  @Override
+  public void close() throws IOException {
+    reader.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return reader.read(cbuf, off, len);
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java Sat May 30 11:12:58 2009
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A PlainFileReader reads the lines provided by a reader into a string array.
+ */
+public class PlainFileReader extends Reader {
+
+  private BufferedReader reader;
+
+  /**
+   * @param reader the reader providing the lines
+   */
+  public PlainFileReader(BufferedReader reader) {
+    super();
+    this.reader = reader;
+  }
+
+  /**
+   * Reads all remaining lines and strips leading and trailing whitespace from each of them.
+   * 
+   * @return the trimmed lines in reading order; blank lines are kept as empty strings
+   * @throws IOException if reading fails
+   */
+  public String[] readLines() throws IOException {
+    List<String> collected = new ArrayList<String>();
+
+    for (String current = reader.readLine(); current != null; current = reader.readLine())
+      collected.add(current.trim());
+
+    return collected.toArray(new String[collected.size()]);
+  }
+
+  @Override
+  public void close() throws IOException {
+    reader.close();
+
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return reader.read(cbuf, off, len);
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java Sat May 30 11:12:58 2009
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenStreamStringConcatenator joins the term texts of a token stream into a single string.
+ */
+public class TokenStreamStringConcatenator {
+
+  /**
+   * Builds a string from the tokens that can be found in the token stream and delimits the
+   * tokens with a given delimiter. The delimiter is placed between tokens only, not behind the
+   * last one.
+   * 
+   * @param ts the token stream providing the tokens
+   * @param delimiter the string placed between two consecutive tokens; must not be null
+   * @return a string made of the token texts, empty if the stream has no tokens
+   * @throws IOException if the token stream fails
+   * @throws NullPointerException if delimiter is null
+   */
+  public String tokenStreamToStringWithDelimiter(TokenStream ts, String delimiter)
+          throws IOException {
+    // StringBuilder.append would silently insert the text "null"; keep failing on a null delimiter
+    if (delimiter == null)
+      throw new NullPointerException("delimiter");
+
+    // a builder avoids copying the whole intermediate string for every token
+    StringBuilder tokenString = new StringBuilder();
+    Token newToken = new Token();
+    boolean first = true;
+    for (Token token = ts.next(newToken); token != null; token = ts.next(newToken)) {
+      if (!first)
+        tokenString.append(delimiter);
+
+      tokenString.append(token.termBuffer(), 0, token.termLength());
+      first = false;
+    }
+    return tokenString.toString();
+  }
+}

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml Sat May 30 11:12:58 2009
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier">
+  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+  <implementationName>de.julielab.jules.consumer.LuceneCASIndexer</implementationName>
+  <processingResourceMetaData>
+    <name>LuceneCASIndexerDescriptor</name>
+    <description/>
+    <version>2.0</version>
+    <vendor>julielab</vendor>
+    <configurationParameters>
+      <configurationParameter>
+        <name>indexOutDir</name>
+        <description>defines the output directory where the index should be written</description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>true</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>mappingFile</name>
+        <description>path to the mapping file</description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>true</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>stopwordFile</name>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>hypernymFile</name>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>tokenMappingFile</name>
+        <description>Mapping file for replacement of  tokens.</description>
+        <type>String</type>
+        <multiValued>true</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>uniqueIndex</name>
+        <type>Boolean</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>ramBufferSize</name>
+        <description>Sets the ram buffer size of the index writer. See lucene docs for further information.</description>
+        <type>Integer</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>compoundFileFormat</name>
+        <description>Determines whether the index writer should use compound file format or not.</description>
+        <type>Boolean</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+    </configurationParameters>
+    <configurationParameterSettings>
+      <nameValuePair>
+        <name>mappingFile</name>
+        <value>
+          <string>src/test/resources/lucas.xml</string>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>indexOutDir</name>
+        <value>
+          <string>src/test/resources/testIndex</string>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>stopwordFile</name>
+        <value>
+          <string>src/test/resources/stopwords.txt</string>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>hypernymFile</name>
+        <value>
+          <string>src/test/resources/hypernyms.txt</string>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>tokenMappingFile</name>
+        <value>
+          <array>
+            <string>src/test/resources/tokenMapping.txt</string>
+          </array>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>uniqueIndex</name>
+        <value>
+          <boolean>true</boolean>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>ramBufferSize</name>
+        <value>
+          <integer>512</integer>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>compoundFileFormat</name>
+        <value>
+          <boolean>true</boolean>
+        </value>
+      </nameValuePair>
+    </configurationParameterSettings>
+    <typeSystemDescription/>
+    <typePriorities/>
+    <fsIndexCollection/>
+    <capabilities>
+      <capability>
+        <inputs/>
+        <outputs/>
+        <languagesSupported/>
+      </capability>
+    </capabilities>
+    <operationalProperties>
+      <modifiesCas>false</modifiesCas>
+      <multipleDeploymentAllowed>false</multipleDeploymentAllowed>
+      <outputsNewCASes>false</outputsNewCASes>
+    </operationalProperties>
+  </processingResourceMetaData>
+  <resourceManagerConfiguration/>
+</casConsumerDescription>

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd Sat May 30 11:12:58 2009
@@ -0,0 +1,121 @@
+<?xml version="1.0"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+		   elementFormDefault="qualified">
+
+	<xs:simpleType name="indexType">
+  		<xs:restriction base="xs:string">
+    		<xs:enumeration value="yes"/>
+    		<xs:enumeration value="no"/>
+    		<xs:enumeration value="no_norms"/>
+    		<xs:enumeration value="no_tf"/>
+    		<xs:enumeration value="no_norms_tf"/>
+  		</xs:restriction>
+	</xs:simpleType>
+
+	<xs:simpleType name="termVectorType">
+  		<xs:restriction base="xs:string">
+    		<xs:enumeration value="no"/>
+    		<xs:enumeration value="positions"/>
+    		<xs:enumeration value="offsets"/>
+    		<xs:enumeration value="positions_offsets"/>
+  		</xs:restriction>
+	</xs:simpleType>
+
+	<xs:simpleType name="storedType">
+  		<xs:restriction base="xs:string">
+    		<xs:enumeration value="yes"/>
+    		<xs:enumeration value="no"/>
+    		<xs:enumeration value="compress"/>
+  		</xs:restriction>
+	</xs:simpleType>
+
+	<xs:simpleType name="tokenizerType">
+  		<xs:restriction base="xs:string">
+    		<xs:enumeration value="cas"/>
+    		<xs:enumeration value="whitespace"/>
+    		<xs:enumeration value="standard"/>
+  		</xs:restriction>
+	</xs:simpleType>
+
+	<xs:simpleType name="positionType">
+  		<xs:restriction base="xs:string">
+    		<xs:enumeration value="first"/>
+    		<xs:enumeration value="last"/>
+  		</xs:restriction>
+	</xs:simpleType>	
+	
+	<xs:element name="feature">
+		<xs:complexType>
+			<xs:attribute name="name" type="xs:string"/>
+			<xs:attribute name="uppercase" type="xs:boolean"/>
+			<xs:attribute name="lowercase" type="xs:boolean"/>
+			<xs:attribute name="numberFormat" type="xs:string"/>
+		</xs:complexType>
+	</xs:element>
+	
+	<xs:element name="annotation">
+		<xs:complexType>
+			<xs:sequence>
+				<xs:element ref="feature" minOccurs="0" maxOccurs="unbounded"/>
+			</xs:sequence>				
+			<xs:attribute name="type" type="xs:string"/>
+			<xs:attribute name="sofa" type="xs:string"/>
+			<xs:attribute name="featurePath" type="xs:string"/>
+			<xs:attribute name="concatString" type="xs:string"/>
+			<xs:attribute name="splitString" type="xs:string"/>
+			<xs:attribute name="prefix" type="xs:string"/>
+			<xs:attribute name="uppercase" type="xs:boolean"/>
+			<xs:attribute name="lowercase" type="xs:boolean"/>
+			<xs:attribute name="stopwordRemove" type="xs:boolean"/>
+			<xs:attribute name="position" type="positionType"/>
+			<xs:attribute name="addHypernyms" type="xs:boolean"/>
+			<xs:attribute name="mappingFile" type="xs:string"/>
+			<xs:attribute name="snowballFilter" type="xs:string"/>
+			<xs:attribute name="unique" type="xs:boolean"/>
+			<xs:attribute name="tokenizer" type="tokenizerType"/>			
+		</xs:complexType>
+	</xs:element>
+
+	<xs:element name="field">
+		<xs:complexType>
+			<xs:sequence>
+				<xs:element ref="annotation" maxOccurs="unbounded"/>
+			</xs:sequence>				
+			<xs:attribute name="name" type="xs:string"/>
+			<xs:attribute name="index" type="indexType"/>
+			<xs:attribute name="termVector" type="termVectorType"/>
+			<xs:attribute name="delimiter" type="xs:string"/>
+			<xs:attribute name="stored" type="storedType"/>
+			<xs:attribute name="merge" type="xs:boolean"/>			
+		</xs:complexType>
+	</xs:element>
+	
+	<xs:element name="fields">
+		<xs:complexType>
+			<xs:sequence>
+				<xs:element ref="field" maxOccurs="unbounded"/>
+			</xs:sequence>
+		</xs:complexType>
+	</xs:element>
+
+</xs:schema>
\ No newline at end of file

Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd
------------------------------------------------------------------------------
    svn:mime-type = text/plain