You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2009/05/30 13:13:01 UTC
svn commit: r780236 [2/6] - in /incubator/uima/sandbox/trunk/Lucas: ./ src/
src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/uima/ src/main/java/org/apache/uima/lucas/
src/main/java/org/apache/uima/lucas/co...
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java Sat May 30 11:12:58 2009
@@ -0,0 +1,574 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.text.Format;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.StringArray;
+import org.apache.uima.jcas.tcas.Annotation;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterators;
+
+/**
+ *
+ * AnnotationTokenStream represents a TokenStream which extracts tokens from feature values of
+ * annotations of a given type from a JCas object. Each token has the start and end offset from the
+ * annotation object. This class supports only the following UIMA JCas types of features:
+ * <ol>
+ * <li>String</li>
+ * <li>StringArray</li>
+ * <li>FSArray</li>
+ * <li>Number types</li>
+ * </ol>
+ *
+ * @author landefeld
+ * @version 0.2
+ */
+public class AnnotationTokenStream extends TokenStream {
+
+ private JCas jCas;
+
+ private String featurePath;
+
+ private List<String> featureNames;
+
+ private String delimiter;
+
+ private Iterator<Annotation> annotationIterator; // iterates over annotations
+
+ private Iterator<FeatureStructure> featureStructureIterator; // iterates over feature structures
+
+ // stored in feature arrays of an
+ // annotation
+
+ private Iterator<String> featureValueIterator; // iterates over the features of a feature
+
+ // structure
+
+ private Annotation currentAnnotation;
+
+ private Type annotationType;
+
+ private Map<String, Format> featureFormats; // a optional map of format object for each feature
+
+ private static Logger logger = Logger.getLogger(AnnotationTokenStream.class);
+
+ /**
+  * Predicate which accepts every element that is not <code>null</code>. It is used to drop
+  * <code>null</code> entries from the iterators handed out by this class.
+  *
+  * The class is a static nested class because it does not use any state of the enclosing
+  * token stream.
+  */
+ private static class NotNullPredicate<T> implements Predicate<T> {
+
+   public boolean apply(T object) {
+     return object != null;
+   }
+ }
+
+ /**
+  * Creates a TokenStream which emits the covered text of every annotation of the given type. Each
+  * token has the start and end offset of its annotation and takes the covered text as term text.
+  *
+  * @param cas
+  *          the JCas to read the annotations from
+  * @param sofaName
+  *          the name of the view of <code>cas</code> which holds the annotations
+  * @param typeName
+  *          the name of the annotation type
+  * @throws CASException
+  *           if the view can not be obtained from <code>cas</code>
+  * @throws IllegalArgumentException
+  *           if the iterators can not be initialized for <code>typeName</code>
+  */
+ public AnnotationTokenStream(JCas cas, String sofaName, String typeName) throws CASException {
+   this(cas, sofaName, typeName, null, Collections.<String> emptyList(), null, null);
+ }
+
+ /**
+  * Creates a TokenStream which extracts the values of a single feature from the annotations of
+  * the given type. Each token has the start and end offset of its annotation and uses the feature
+  * value as term text.
+  *
+  * @param cas
+  *          the JCas to read the annotations from
+  * @param sofaName
+  *          the name of the view of <code>cas</code> which holds the annotations
+  * @param typeName
+  *          the name of the annotation type
+  * @param featureName
+  *          the name of the feature from which the token text is built
+  * @param featureFormat
+  *          optional format object to convert the feature value to a string, may be
+  *          <code>null</code>
+  * @throws CASException
+  *           if the view can not be obtained from <code>cas</code>
+  * @throws IllegalArgumentException
+  *           if the iterators can not be initialized for <code>typeName</code>
+  */
+ public AnnotationTokenStream(JCas cas, String sofaName, String typeName, String featureName,
+     Format featureFormat) throws CASException {
+   this(cas, sofaName, typeName, null, singleFeatureName(featureName), null,
+       singleFeatureFormat(featureName, featureFormat));
+ }
+
+ /**
+  * Creates a TokenStream which extracts the values of several features from the annotations of
+  * the given type. Each token has the start and end offset of its annotation.
+  *
+  * @param cas
+  *          the JCas to read the annotations from
+  * @param sofaName
+  *          the name of the view of <code>cas</code> which holds the annotations
+  * @param typeName
+  *          the name of the annotation type
+  * @param featureNames
+  *          the names of the features from which the token text is built
+  * @param delimiter
+  *          a delimiter for concatenating the feature values of one annotation into a single
+  *          token. If <code>null</code> the values are not concatenated and every value becomes
+  *          a token of its own.
+  * @param featureFormats
+  *          optional map of format objects to convert feature values to strings - the key must be
+  *          the feature name. May be <code>null</code>.
+  * @throws CASException
+  *           if the view can not be obtained from <code>cas</code>
+  * @throws IllegalArgumentException
+  *           if the iterators can not be initialized for <code>typeName</code>
+  */
+ public AnnotationTokenStream(JCas cas, String sofaName, String typeName,
+     List<String> featureNames, String delimiter, Map<String, Format> featureFormats)
+     throws CASException {
+   this(cas, sofaName, typeName, null, featureNames, delimiter, featureFormats);
+ }
+
+ /**
+  * Creates a TokenStream which extracts the values of several features from the annotations of
+  * the given type without concatenating them. Each token has the start and end offset of its
+  * annotation.
+  *
+  * @param cas
+  *          the JCas to read the annotations from
+  * @param sofaName
+  *          the name of the view of <code>cas</code> which holds the annotations
+  * @param typeName
+  *          the name of the annotation type
+  * @param featureNames
+  *          the names of the features from which the token text is built
+  * @param featureFormats
+  *          optional map of format objects to convert feature values to strings - the key must be
+  *          the feature name. May be <code>null</code>.
+  * @throws CASException
+  *           if the view can not be obtained from <code>cas</code>
+  * @throws IllegalArgumentException
+  *           if the iterators can not be initialized for <code>typeName</code>
+  */
+ public AnnotationTokenStream(JCas cas, String sofaName, String typeName,
+     List<String> featureNames, Map<String, Format> featureFormats) throws CASException {
+   this(cas, sofaName, typeName, null, featureNames, null, featureFormats);
+ }
+
+ /**
+  * Creates a TokenStream which extracts feature values from feature structures that are reached
+  * from the annotations of the given type by following a feature path. For example an annotation
+  * of type person has a feature address whose values are address feature structures with features
+  * for the street, postal code and city. To create tokens with postal code and city of a persons
+  * address, the featurePath must be "address" and the featureNames "postalCode" and "city". Each
+  * token has the start and end offset of the annotation.
+  *
+  * @param cas
+  *          the JCas to read the annotations from
+  * @param sofaName
+  *          the name of the view of <code>cas</code> which holds the annotations
+  * @param typeName
+  *          the name of the annotation type
+  * @param featurePath
+  *          the path to the feature structures whose features should be used for tokens. Path
+  *          entries are separated by ".". Example: "affiliation.address.country"
+  * @param featureNames
+  *          the names of the features from which the token text is built
+  * @param featureFormats
+  *          optional map of format objects to convert feature values to strings - the key must be
+  *          the feature name. May be <code>null</code>.
+  * @throws CASException
+  *           if the view can not be obtained from <code>cas</code>
+  * @throws IllegalArgumentException
+  *           if the iterators can not be initialized for <code>typeName</code>
+  */
+ public AnnotationTokenStream(JCas cas, String sofaName, String typeName, String featurePath,
+     List<String> featureNames, Map<String, Format> featureFormats) throws CASException {
+   this(cas, sofaName, typeName, featurePath, featureNames, null, featureFormats);
+ }
+
+ /**
+  * Creates a TokenStream which extracts feature values from feature structures that are reached
+  * from the annotations of the given type by following a feature path, optionally concatenating
+  * the values with a delimiter. All other constructors delegate to this one.
+  *
+  * @param cas
+  *          the JCas to read the annotations from
+  * @param sofaName
+  *          the name of the view of <code>cas</code> which holds the annotations
+  * @param typeName
+  *          the name of the annotation type
+  * @param featurePath
+  *          the path to the feature structures whose features should be used for tokens. Path
+  *          entries are separated by ".". Example: "affiliation.address.country". If
+  *          <code>null</code> the features are read from the annotation itself.
+  * @param featureNames
+  *          the names of the features from which the token text is built. If empty the covered
+  *          text of the annotation is used.
+  * @param delimiter
+  *          a delimiter for concatenating the feature values of one feature structure into a
+  *          single token. If <code>null</code> the values are not concatenated and every value
+  *          becomes a token of its own.
+  * @param featureFormats
+  *          optional map of format objects to convert feature values to strings - the key must be
+  *          the feature name. May be <code>null</code>.
+  * @throws CASException
+  *           if the view can not be obtained from <code>cas</code>
+  * @throws IllegalArgumentException
+  *           if the iterators can not be initialized for <code>typeName</code>
+  */
+ public AnnotationTokenStream(JCas cas, String sofaName, String typeName, String featurePath,
+     List<String> featureNames, String delimiter, Map<String, Format> featureFormats)
+     throws CASException {
+   super();
+   jCas = cas.getView(sofaName);
+   this.featurePath = featurePath;
+   this.featureNames = featureNames;
+   this.delimiter = delimiter;
+   if (featureFormats == null) {
+     this.featureFormats = Collections.<String, Format> emptyMap();
+   } else {
+     this.featureFormats = featureFormats;
+   }
+
+   try {
+     annotationType = jCas.getTypeSystem().getType(typeName);
+     logger.debug(typeName + ", found: " + (annotationType != null));
+     logger.debug("featurePath: " + featurePath);
+     logger.debug("featureNames: " + featureNames);
+     initializeIterators();
+   } catch (Exception e) {
+     // report every setup problem together with the type it occurred for
+     throw new IllegalArgumentException(e.getMessage() + " at type " + typeName, e);
+   }
+ }
+
+ /**
+  * Wraps a single feature name into a new, modifiable list.
+  */
+ private static List<String> singleFeatureName(String featureName) {
+   List<String> names = new ArrayList<String>();
+   names.add(featureName);
+   return names;
+ }
+
+ /**
+  * Builds the format map for a single feature, or returns <code>null</code> if no format is
+  * given.
+  */
+ private static Map<String, Format> singleFeatureFormat(String featureName, Format featureFormat) {
+   if (featureFormat == null) {
+     return null;
+   }
+   Map<String, Format> formats = new HashMap<String, Format>();
+   formats.put(featureName, featureFormat);
+   return formats;
+ }
+
+ /**
+  * Writes the next feature value into the given token and returns it. The token gets the begin
+  * and end offset of the annotation the value was read from.
+  *
+  * @param token
+  *          the token to fill
+  * @return the filled token or <code>null</code> if there are no more annotations
+  * @throws IOException
+  *           if reading the annotations or their features fails; the original exception is the
+  *           cause
+  */
+ @Override
+ public Token next(Token token) throws IOException {
+   try {
+     // advance to the next feature structure, and if necessary to the next annotation, until
+     // there is a feature value to emit
+     while (!featureValueIterator.hasNext()) {
+       while (!featureStructureIterator.hasNext()) {
+         if (!annotationIterator.hasNext()) {
+           return null;
+         }
+         currentAnnotation = annotationIterator.next();
+         featureStructureIterator = createFeatureStructureIterator(currentAnnotation, featurePath);
+       }
+
+       featureValueIterator =
+           createFeatureValueIterator(featureStructureIterator.next(), featureNames);
+     }
+
+     token.setStartOffset(currentAnnotation.getBegin());
+     token.setEndOffset(currentAnnotation.getEnd());
+
+     char[] value = featureValueIterator.next().toCharArray();
+     token.setTermBuffer(value, 0, value.length);
+     return token;
+
+   } catch (Exception e) {
+     // only exceptions are wrapped; errors of the JVM are not turned into IOExceptions
+     String typeName = annotationType == null ? null : annotationType.getName();
+     IOException ioException =
+         new IOException(e + " at type " + typeName + " features " + featureNames
+             + " featurePath " + featurePath + " sofa " + jCas.getViewName(), e);
+     // pass the cause to the logger so that the stack trace is written to the log
+     logger.error(ioException.getMessage(), e);
+     throw ioException;
+   }
+ }
+
+ /**
+  * Returns the next token of this stream in a newly created {@link Token}.
+  *
+  * @return the next token or <code>null</code> if there are no more annotations
+  * @see org.apache.lucene.analysis.TokenStream#next()
+  */
+ @Override
+ public Token next() throws IOException {
+   Token freshToken = new Token();
+   return next(freshToken);
+ }
+
+ /**
+  * Positions the three iterators of this stream on the first annotation of the annotation type,
+  * its first feature structure and the values of that feature structure. Iterators for which
+  * there is nothing to iterate over are set to empty iterators.
+  */
+ protected void initializeIterators() {
+   annotationIterator =
+       Iterators.filter(jCas.getAnnotationIndex(annotationType).iterator(),
+           new NotNullPredicate<Annotation>());
+
+   if (annotationIterator.hasNext()) {
+     currentAnnotation = annotationIterator.next();
+     featureStructureIterator = createFeatureStructureIterator(currentAnnotation, featurePath);
+
+     if (featureStructureIterator.hasNext()) {
+       FeatureStructure firstStructure = featureStructureIterator.next();
+       featureValueIterator = createFeatureValueIterator(firstStructure, featureNames);
+     } else {
+       // the first annotation has no feature structures on the feature path
+       featureValueIterator = Iterators.emptyIterator();
+     }
+   } else {
+     // no annotation of the requested type at all
+     featureStructureIterator = Iterators.emptyIterator();
+     featureValueIterator = Iterators.emptyIterator();
+   }
+ }
+
+ /**
+  * Collects the feature structures which are reached from an annotation by following a feature
+  * path. Array valued features on the path contribute every element of the array. Feature
+  * structures whose value for a path entry is <code>null</code> are dropped instead of being
+  * followed any further.
+  *
+  * @param annotation
+  *          the annotation to start from
+  * @param featurePath
+  *          the path entries separated by ".", or <code>null</code> to use the annotation itself
+  * @return an iterator over the reached feature structures, never containing <code>null</code>
+  * @throws IllegalArgumentException
+  *           if a path entry is not a feature of the type reached so far
+  */
+ protected Iterator<FeatureStructure> createFeatureStructureIterator(Annotation annotation,
+     String featurePath) {
+   List<FeatureStructure> featureStructures = new ArrayList<FeatureStructure>();
+   featureStructures.add(annotation);
+
+   if (featurePath == null) {
+     return featureStructures.iterator();
+   }
+
+   Type currentType = annotation.getType();
+   if (currentType.isArray()) {
+     currentType = currentType.getComponentType();
+   }
+
+   for (String pathEntry : featurePath.split("\\.")) {
+     Feature feature = currentType.getFeatureByBaseName(pathEntry);
+     if (feature == null) {
+       throw new IllegalArgumentException("path entry '" + pathEntry + "' of feature path '"
+           + featurePath + "' is not a feature of type " + currentType.getName());
+     }
+
+     boolean arrayValued = feature.getRange().isArray();
+     List<FeatureStructure> children = new ArrayList<FeatureStructure>();
+
+     for (FeatureStructure parent : featureStructures) {
+       FeatureStructure value = parent.getFeatureValue(feature);
+       // a missing value ends this branch of the path; following it would fail on the next entry
+       if (value == null) {
+         continue;
+       }
+
+       if (arrayValued) {
+         FSArray fsArray = (FSArray) value;
+         for (int i = 0; i < fsArray.size(); i++) {
+           FeatureStructure element = fsArray.get(i);
+           if (element != null) {
+             children.add(element);
+           }
+         }
+       } else {
+         children.add(value);
+       }
+     }
+
+     currentType = feature.getRange();
+     if (currentType.isArray()) {
+       currentType = currentType.getComponentType();
+     }
+
+     featureStructures = children;
+   }
+
+   return featureStructures.iterator();
+ }
+
+ /**
+  * Creates the term texts for one feature structure. Without feature names the covered text of
+  * the current annotation is used. If a delimiter is set, all values are concatenated into a
+  * single term text, otherwise every value is a term text of its own. <code>null</code> values
+  * are ignored.
+  *
+  * @param srcFeatureStructure
+  *          the feature structure to read the feature values from
+  * @param featureNames
+  *          the base names of the features to read
+  * @return an iterator over the term texts, never containing <code>null</code>
+  * @throws IllegalArgumentException
+  *           if a feature name is not a feature of the type of the feature structure
+  */
+ protected Iterator<String> createFeatureValueIterator(FeatureStructure srcFeatureStructure,
+     Collection<String> featureNames) {
+   List<String> values = new ArrayList<String>();
+   Type featureType = srcFeatureStructure.getType();
+
+   if (featureNames.isEmpty()) {
+     values.add(currentAnnotation.getCoveredText());
+   }
+
+   for (String featureName : featureNames) {
+     Feature feature = featureType.getFeatureByBaseName(featureName);
+     if (feature == null) {
+       throw new IllegalArgumentException("type " + featureType.getName()
+           + " has no feature " + featureName);
+     }
+
+     if (feature.getRange().isArray()) {
+       StringArray stringArray = (StringArray) srcFeatureStructure.getFeatureValue(feature);
+       // an unset array feature contributes no values
+       if (stringArray == null) {
+         continue;
+       }
+
+       List<String> elements = new ArrayList<String>(stringArray.size());
+       for (int i = 0; i < stringArray.size(); i++) {
+         elements.add(stringArray.get(i));
+       }
+
+       if (featureNames.size() == 1 || delimiter == null) {
+         // every array element is a value of its own
+         values.addAll(elements);
+       } else {
+         // several features are combined: the array contributes one concatenated value
+         values.add(joinNonNull(elements, delimiter));
+       }
+     } else {
+       values.add(getValueForFeature(srcFeatureStructure, feature, featureFormats.get(feature
+           .getShortName())));
+     }
+   }
+
+   if (delimiter != null) {
+     String joined = joinNonNull(values, delimiter);
+     values.clear();
+     values.add(joined);
+   }
+
+   return Iterators.filter(values.iterator(), new NotNullPredicate<String>());
+ }
+
+ /**
+  * Concatenates the non-null entries of a list, separated by the given delimiter. Returns the
+  * empty string if there is no non-null entry.
+  */
+ private static String joinNonNull(List<String> parts, String delimiter) {
+   StringBuilder joined = new StringBuilder();
+   boolean first = true;
+   for (String part : parts) {
+     if (part == null) {
+       continue;
+     }
+     if (!first) {
+       joined.append(delimiter);
+     }
+     joined.append(part);
+     first = false;
+   }
+   return joined.toString();
+ }
+
+ /**
+  * Returns the value of a feature as string. Values of number features are converted with the
+  * given format if there is one. All other values, and all values if no format is given, are
+  * returned as provided by {@link FeatureStructure#getFeatureValueAsString(Feature)}.
+  *
+  * @param featureStructure
+  *          the feature structure to read the value from
+  * @param feature
+  *          the feature to read
+  * @param format
+  *          optional format for number values, may be <code>null</code>
+  * @return the feature value as string
+  */
+ public String getValueForFeature(FeatureStructure featureStructure, Feature feature, Format format) {
+   if (format == null) {
+     return featureStructure.getFeatureValueAsString(feature);
+   }
+
+   String rangeName = feature.getRange().getName();
+   Object value;
+   if (CAS.TYPE_NAME_DOUBLE.equals(rangeName)) {
+     value = featureStructure.getDoubleValue(feature);
+   } else if (CAS.TYPE_NAME_FLOAT.equals(rangeName)) {
+     value = featureStructure.getFloatValue(feature);
+   } else if (CAS.TYPE_NAME_LONG.equals(rangeName)) {
+     value = featureStructure.getLongValue(feature);
+   } else if (CAS.TYPE_NAME_INTEGER.equals(rangeName)) {
+     value = featureStructure.getIntValue(feature);
+   } else if (CAS.TYPE_NAME_SHORT.equals(rangeName)) {
+     value = featureStructure.getShortValue(feature);
+   } else if (CAS.TYPE_NAME_BYTE.equals(rangeName)) {
+     value = featureStructure.getByteValue(feature);
+   } else {
+     // not a number feature: there is no number to hand to the format, so use the plain string
+     return featureStructure.getFeatureValueAsString(feature);
+   }
+
+   return format.format(value);
+ }
+
+ /**
+  * Rewinds this stream to the first annotation. The configuration of the stream, including the
+  * feature formats, is kept so that the stream produces the same tokens again.
+  */
+ @Override
+ public void reset() {
+   featureStructureIterator = null;
+   currentAnnotation = null;
+   initializeIterators();
+ }
+
+ /**
+  * @return the format objects used to convert feature values, keyed by feature name
+  */
+ public Map<String, Format> getFeatureFormats() {
+   return this.featureFormats;
+ }
+
+ /**
+  * @return the view of the JCas the annotations are read from
+  */
+ public JCas getJCas() {
+   return this.jCas;
+ }
+
+ /**
+  * @return the feature path, or <code>null</code> if none was given
+  */
+ public String getFeaturePath() {
+   return this.featurePath;
+ }
+
+ /**
+  * @return the names of the features the token texts are built from
+  */
+ public List<String> getFeatureNames() {
+   return this.featureNames;
+ }
+
+ /**
+  * @return the delimiter used to concatenate feature values, or <code>null</code> if none was
+  *         given
+  */
+ public String getDelimiter() {
+   return this.delimiter;
+ }
+
+ /**
+  * @return the type of the annotations this stream iterates over
+  */
+ public Type getAnnotationType() {
+   return this.annotationType;
+ }
+
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/AnnotationTokenStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java Sat May 30 11:12:58 2009
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.CharTokenizer;
+
+/**
+ * Tokenizer which splits its input at every occurrence of a single delimiter character.
+ */
+public class DelimiterTokenizer extends CharTokenizer {
+
+  /** The character which separates two tokens. */
+  private final char delimiter;
+
+  /**
+   * @param arg0
+   *          the reader providing the text to tokenize
+   * @param delimiter
+   *          the character which separates two tokens
+   */
+  public DelimiterTokenizer(Reader arg0, char delimiter) {
+    super(arg0);
+    this.delimiter = delimiter;
+  }
+
+  /**
+   * Every character except the delimiter belongs to a token.
+   */
+  @Override
+  protected boolean isTokenChar(char c) {
+    return c != delimiter;
+  }
+
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/DelimiterTokenizer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * TokenFilter which enriches a TokenStream with hypernyms. After each input token that has an
+ * entry in the hypernym map, one additional token per hypernym is emitted with the offsets of the
+ * input token and a position increment of 0.
+ */
+public class HypernymTokenFilter extends TokenFilter {
+
+  private static Logger logger = Logger.getLogger(HypernymTokenFilter.class);
+
+  /** key: token text, value: the hypernyms of that text */
+  private Map<String, List<String>> hypernyms;
+
+  /** the stream the input tokens are read from */
+  private TokenStream tokenStream;
+
+  /** index of the next hypernym to emit, or -1 if no hypernyms are pending */
+  private int currentHypernymIndex;
+
+  /** hypernyms of the last input token, or null if it has none */
+  private List<String> currentHypernyms;
+
+  /** the last token read from the input stream */
+  private Token inputToken;
+
+  /**
+   * Constructor.
+   *
+   * @param input
+   *          the input TokenStream
+   * @param hypernyms
+   *          the hypernym map. key: token text, value: list of hypernyms
+   */
+  public HypernymTokenFilter(TokenStream input, Map<String, List<String>> hypernyms) {
+    super(input);
+    this.tokenStream = input;
+    this.hypernyms = hypernyms;
+    this.currentHypernymIndex = -1;
+  }
+
+  /**
+   * Returns the next pending hypernym token, or the next input token if no hypernyms are pending.
+   *
+   * @return the next token or <code>null</code> if the input stream is exhausted
+   */
+  @Override
+  public Token next() throws IOException {
+    if (currentHypernymIndex >= 0) {
+      int hypernymCount = currentHypernyms.size();
+
+      if (currentHypernymIndex < hypernymCount) {
+        String hypernym = currentHypernyms.get(currentHypernymIndex);
+        Token hypernymToken =
+            new Token(hypernym, inputToken.startOffset(), inputToken.endOffset());
+        // hypernyms share the position of the token they belong to
+        hypernymToken.setPositionIncrement(0);
+        logger.debug("adding hypernym " + hypernymToken.termText() + " for :"
+            + inputToken.termText());
+        currentHypernymIndex++;
+        return hypernymToken;
+      }
+
+      if (currentHypernymIndex == hypernymCount) {
+        // all hypernyms of the last input token have been emitted
+        currentHypernymIndex = -1;
+        currentHypernyms = null;
+      }
+    }
+
+    inputToken = tokenStream.next();
+    if (inputToken == null) {
+      return null;
+    }
+
+    List<String> found = hypernyms.get(inputToken.termText());
+    currentHypernyms = found;
+    if (found != null) {
+      currentHypernymIndex = 0;
+    }
+    return inputToken;
+  }
+
+  /**
+   * Discards pending hypernyms and resets the input stream.
+   */
+  @Override
+  public void reset() throws IOException {
+    currentHypernyms = null;
+    currentHypernymIndex = -1;
+    inputToken = null;
+
+    tokenStream.reset();
+  }
+
+  /**
+   * @return the hypernym map this filter was created with
+   */
+  public Map<String, List<String>> getHypernyms() {
+    return hypernyms;
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/HypernymTokenFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * TokenFilter which lets only the first or only the last token of its input stream pass. With any
+ * other position value all tokens pass unchanged.
+ */
+public class PositionFilter extends TokenFilter {
+
+  /** Position value which selects the first token of the input stream. */
+  public final static Integer FIRST_POSITION = 0;
+
+  /** Position value which selects the last token of the input stream. */
+  public final static Integer LAST_POSITION = 1;
+
+  /** The first token once it has been emitted; only used for {@link #FIRST_POSITION}. */
+  private Token token;
+
+  private Integer position;
+
+  /**
+   * @param input
+   *          the input TokenStream
+   * @param position
+   *          {@link #FIRST_POSITION} or {@link #LAST_POSITION}
+   */
+  public PositionFilter(TokenStream input, Integer position) {
+    super(input);
+    this.position = position;
+  }
+
+  /**
+   * Returns the selected token on the first call and <code>null</code> afterwards.
+   *
+   * @return the next token or <code>null</code> if there is none
+   */
+  @Override
+  public Token next() throws IOException {
+    if (FIRST_POSITION.equals(position)) {
+      // the first token has been emitted already, nothing else passes
+      if (token != null) {
+        return null;
+      }
+      token = input.next();
+      return token;
+    }
+
+    if (LAST_POSITION.equals(position)) {
+      // drain the input stream and keep the last token seen
+      Token lastToken = null;
+      for (Token current = input.next(); current != null; current = input.next()) {
+        lastToken = current;
+      }
+      return lastToken;
+    }
+
+    return input.next();
+  }
+
+  /**
+   * Forgets the emitted first token and resets the input stream, so that the filter selects a
+   * token again.
+   */
+  @Override
+  public void reset() throws IOException {
+    token = null;
+    input.reset();
+  }
+
+  /**
+   * @return the position value this filter was created with
+   */
+  public Integer getPosition() {
+    return position;
+  }
+
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/PositionFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * TokenFilter which replaces the term text of a token if the mapping has an entry for it. Tokens
+ * without an entry pass unchanged.
+ */
+public class ReplaceFilter extends TokenFilter {
+
+  private TokenStream input;
+
+  /** key: term text to replace, value: the replacement */
+  private Map<String, String> mapping;
+
+  /**
+   * @param input
+   *          the input TokenStream
+   * @param mapping
+   *          the replacements. key: term text, value: replacement text
+   */
+  public ReplaceFilter(TokenStream input, Map<String, String> mapping) {
+    super(input);
+    this.mapping = mapping;
+    this.input = input;
+  }
+
+  /**
+   * @return the next input token, with its term text replaced if the mapping has an entry for
+   *         it, or <code>null</code> if the input stream is exhausted
+   */
+  @Override
+  public Token next() throws IOException {
+    Token current = input.next();
+
+    if (current != null) {
+      String original = new String(current.termBuffer(), 0, current.termLength());
+      String replacement = mapping.get(original);
+
+      if (replacement != null) {
+        char[] replacementChars = replacement.toCharArray();
+        current.setTermBuffer(replacementChars, 0, replacementChars.length);
+      }
+    }
+
+    return current;
+  }
+
+  /**
+   * @return the replacement mapping this filter was created with
+   */
+  public Map<String, String> getMapping() {
+    return mapping;
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/ReplaceFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import com.google.common.collect.Iterators;
+
+/**
+ * Token filter that splits the term text of every incoming token at a separator and emits one
+ * token per resulting part. All parts of one input token carry the start and end offset of that
+ * input token.
+ */
+public class SplitterFilter extends TokenFilter {
+
+  private String splitString;
+
+  private Iterator<String> splitIterator;
+
+  private Token currentToken;
+
+  /**
+   * @param input the token stream whose tokens are split
+   * @param splitString the separator; it is handed to {@link String#split(String)} and is
+   *        therefore interpreted as a regular expression
+   */
+  public SplitterFilter(TokenStream input, String splitString) {
+    super(input);
+    this.splitString = splitString;
+    this.splitIterator = Iterators.emptyIterator();
+  }
+
+  /**
+   * Returns the next part of the input token currently being split, pulling further tokens from
+   * the wrapped stream as needed.
+   *
+   * @param token the token instance that is filled and returned
+   * @return the given token carrying the next part, or null when the wrapped stream is exhausted
+   */
+  @Override
+  public Token next(Token token) throws IOException {
+
+    // String.split drops trailing empty strings, so a token whose text consists only of
+    // separators yields no parts at all. Keep pulling input tokens in that case instead of
+    // reporting the end of the stream while the input still has tokens.
+    while (!splitIterator.hasNext()) {
+      currentToken = input.next(token);
+      if (currentToken == null)
+        return null;
+
+      String tokenText = new String(currentToken.termBuffer(), 0, currentToken.termLength());
+      String[] splitts = tokenText.split(splitString);
+      splitIterator = Iterators.forArray(splitts, 0, splitts.length);
+    }
+
+    token.setStartOffset(currentToken.startOffset());
+    token.setEndOffset(currentToken.endOffset());
+    char[] termBuffer = splitIterator.next().toCharArray();
+    token.setTermBuffer(termBuffer, 0, termBuffer.length);
+
+    return token;
+  }
+
+  @Override
+  public Token next() throws IOException {
+    return next(new Token());
+  }
+
+  /**
+   * Resets the wrapped stream and discards the parts of the token that was being split, so that
+   * no stale parts are emitted after a reset.
+   */
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    splitIterator = Iterators.emptyIterator();
+    currentToken = null;
+  }
+
+  /**
+   * @return the separator this filter was created with
+   */
+  public String getSplitString() {
+    return splitString;
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SplitterFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java Sat May 30 11:12:58 2009
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenStreamConcatenator takes a {@link java.util.Collection Collection} of
+ * {@link org.apache.lucene.analysis.TokenStream token streams} and emits their tokens one stream
+ * after the other, in the iteration order of the collection.
+ */
+public class TokenStreamConcatenator extends TokenStream {
+
+  private Collection<TokenStream> tokenStreams;
+
+  private Iterator<TokenStream> tokenStreamIterator;
+
+  private TokenStream currentTokenStream;
+
+  /**
+   * @param tokenStreams the streams to concatenate
+   */
+  public TokenStreamConcatenator(Collection<TokenStream> tokenStreams) {
+    super();
+    this.tokenStreams = tokenStreams;
+    rewind();
+  }
+
+  /**
+   * Returns the next token of the current stream, moving on to the following streams whenever
+   * the current one has no more tokens.
+   *
+   * @return the next token, or null when no stream has a token left
+   */
+  @Override
+  public Token next() throws IOException {
+    if (currentTokenStream == null) {
+      if (!tokenStreamIterator.hasNext()) {
+        return null;
+      }
+      currentTokenStream = tokenStreamIterator.next();
+    }
+
+    Token candidate = currentTokenStream.next();
+    while (candidate == null) {
+      if (!tokenStreamIterator.hasNext()) {
+        return null;
+      }
+      currentTokenStream = tokenStreamIterator.next();
+      candidate = currentTokenStream.next();
+    }
+    return candidate;
+  }
+
+  /**
+   * Resets every underlying stream and starts again with the first one.
+   */
+  public void reset() throws IOException {
+    for (TokenStream tokenStream : tokenStreams) {
+      tokenStream.reset();
+    }
+    rewind();
+  }
+
+  /**
+   * Obtains a fresh iterator over the streams and makes the first stream the current one; the
+   * current stream is null if the collection is empty.
+   */
+  private void rewind() {
+    tokenStreamIterator = tokenStreams.iterator();
+    currentTokenStream = tokenStreamIterator.hasNext() ? tokenStreamIterator.next() : null;
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamConcatenator.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java Sat May 30 11:12:58 2009
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenStreamMerger merges a {@link java.util.List list} of
+ * {@link org.apache.lucene.analysis.TokenStream token streams} by the means of their token offsets.
+ * Adapts positionIncrement of tokens if their startOffset is exactly the same.
+ */
+public class TokenStreamMerger extends TokenStream {
+
+ private class TokenComparator implements Comparator<Token> {
+
+ public int compare(Token token1, Token token2) {
+
+ return token2.startOffset() - token1.startOffset();
+
+ }
+ }
+
+ private Collection<TokenStream> streams;
+
+ private int currentOffset;
+
+ private TokenComparator comparator;
+
+ private Map<Token, TokenStream> currentTokens;
+
+ private Stack<Token> sortedTokens;
+
+ private boolean initialized;
+
+ public TokenStreamMerger(Collection<TokenStream> streams) throws IOException {
+ super();
+ this.streams = streams;
+ this.comparator = new TokenComparator();
+ currentTokens = new LinkedHashMap<Token, TokenStream>();
+ currentOffset = -1;
+ sortedTokens = new Stack<Token>();
+
+ }
+
+ private void init() throws IOException {
+ for (TokenStream stream : streams) {
+ Token token = stream.next();
+ if (token != null)
+ currentTokens.put(token, stream);
+ }
+ rebuildSortedTokens();
+ initialized = true;
+ }
+
+ public void reset() throws IOException {
+ currentTokens.clear();
+ for (TokenStream stream : streams)
+ stream.reset();
+
+ currentOffset = -1;
+ sortedTokens.clear();
+ initialized = false;
+ }
+
+ @Override
+ public Token next() throws IOException {
+ if (!initialized)
+ init();
+
+ if (sortedTokens.size() == 0)
+ return null;
+
+ Token currentToken = sortedTokens.pop();
+ currentTokens.remove(currentToken);
+ rebuildSortedTokens();
+
+ if (currentToken.startOffset() == currentOffset)
+ currentToken.setPositionIncrement(0);
+ else
+ currentToken.setPositionIncrement(1);
+
+ currentOffset = currentToken.startOffset();
+
+ return currentToken;
+ }
+
+ private void rebuildSortedTokens() throws IOException {
+ for (TokenStream stream : streams)
+ if (!currentTokens.values().contains(stream)) {
+ Token token = stream.next();
+ if (token != null)
+ currentTokens.put(token, stream);
+ }
+
+ sortedTokens.clear();
+ sortedTokens.addAll(currentTokens.keySet());
+ Collections.sort(sortedTokens, comparator);
+ }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/TokenStreamMerger.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A UniqueFilter removes repeated occurrences of {@link org.apache.lucene.analysis.Token tokens}
+ * with the same term text: only the first token of each term text is passed on.
+ */
+public class UniqueFilter extends TokenFilter {
+
+  private Collection<String> termTexts;
+
+  /**
+   * @param input the token stream to filter
+   */
+  public UniqueFilter(TokenStream input) {
+    super(input);
+    termTexts = new HashSet<String>();
+  }
+
+  /**
+   * Skips tokens whose term text has already been emitted since creation or the last reset.
+   *
+   * @return the next token with a term text not seen before, or null when the wrapped stream is
+   *         exhausted
+   */
+  @Override
+  public Token next() throws IOException {
+    for (Token candidate = input.next(); candidate != null; candidate = input.next()) {
+      String text = new String(candidate.termBuffer(), 0, candidate.termLength());
+      // add() reports false for a text the set already holds, i.e. a repeated term
+      if (termTexts.add(text)) {
+        return candidate;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Resets the wrapped stream and forgets all term texts seen so far.
+   */
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    termTexts.clear();
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UniqueFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java Sat May 30 11:12:58 2009
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Token filter that converts the term text of every token to upper case.
+ */
+public class UpperCaseTokenFilter extends TokenFilter {
+
+  /**
+   * @param input the token stream whose tokens are upper-cased
+   */
+  public UpperCaseTokenFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * @return the next token of the wrapped stream with its term text in upper case, or null when
+   *         the wrapped stream is exhausted
+   */
+  @Override
+  public Token next() throws IOException {
+    Token nextToken = input.next();
+    if (nextToken == null)
+      return null;
+
+    String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
+    // Use a fixed locale: with the JVM default locale the result differs between machines
+    // (e.g. "i" becomes a dotted capital I under a Turkish locale), which would make the
+    // indexed terms depend on where the index was built.
+    termText = termText.toUpperCase(java.util.Locale.ENGLISH);
+
+    nextToken.setTermBuffer(termText.toCharArray(), 0, termText.length());
+
+    return nextToken;
+  }
+
+  /**
+   * Resets the wrapped stream.
+   */
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/UpperCaseTokenFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java Sat May 30 11:12:58 2009
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Reader for mapping files that contain one <code>key=value</code> pair per line.
+ */
+public class MapFileReader extends Reader {
+
+  private BufferedReader reader;
+
+  /**
+   * @param reader the reader the mapping lines are read from
+   */
+  public MapFileReader(BufferedReader reader) {
+    super();
+    this.reader = reader;
+  }
+
+  /**
+   * Reads all remaining lines and turns them into a map. Each line is split at its first
+   * <code>=</code>: the text before it is the key, everything after it (possibly empty, possibly
+   * containing further <code>=</code> characters) is the value. Blank lines are skipped. If a key
+   * occurs more than once, the last line wins.
+   *
+   * @return the mapping read from the underlying reader
+   * @throws IOException if reading fails or a non-blank line contains no <code>=</code>
+   */
+  public Map<String, String> readMap() throws IOException {
+    Map<String, String> mapping = new HashMap<String, String>();
+
+    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+      if (line.trim().length() == 0)
+        continue;
+
+      // Split at the first '=' only, so that values may contain '=' and may be empty.
+      int separator = line.indexOf('=');
+      if (separator < 0)
+        throw new IOException("Malformed mapping line (missing '='): " + line);
+
+      mapping.put(line.substring(0, separator), line.substring(separator + 1));
+    }
+
+    return mapping;
+  }
+
+  @Override
+  public void close() throws IOException {
+    reader.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return reader.read(cbuf, off, len);
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MapFileReader.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java Sat May 30 11:12:58 2009
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Reader for multimap files that contain one <code>term=value1|value2|...</code> entry per line.
+ */
+public class MultimapFileReader extends Reader {
+
+  private BufferedReader reader;
+
+  /**
+   * @param reader the reader the multimap lines are read from
+   */
+  public MultimapFileReader(BufferedReader reader) {
+    super();
+    this.reader = reader;
+  }
+
+  /**
+   * Reads all remaining lines and turns them into a map from term to value list. Each line is
+   * split at its first <code>=</code>: the text before it is the term, the text after it is split
+   * at <code>|</code> into the values. A term without any text after the <code>=</code> is mapped
+   * to an empty list. Blank lines are skipped. If a term occurs more than once, the last line
+   * wins.
+   *
+   * @return the multimap read from the underlying reader
+   * @throws IOException if reading fails or a non-blank line contains no <code>=</code>
+   */
+  public Map<String, List<String>> readMultimap() throws IOException {
+    Map<String, List<String>> multimap = new HashMap<String, List<String>>();
+
+    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+      if (line.trim().length() == 0)
+        continue;
+
+      // Split at the first '=' only, so that values may contain '=' and may be missing.
+      int separator = line.indexOf('=');
+      if (separator < 0)
+        throw new IOException("Malformed multimap line (missing '='): " + line);
+
+      String term = line.substring(0, separator);
+      String valuePart = line.substring(separator + 1);
+
+      List<String> valueList = new ArrayList<String>();
+      if (valuePart.length() > 0)
+        for (String value : valuePart.split("\\|"))
+          valueList.add(value);
+
+      multimap.put(term, valueList);
+    }
+    return multimap;
+  }
+
+  @Override
+  public void close() throws IOException {
+    reader.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return reader.read(cbuf, off, len);
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/MultimapFileReader.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java Sat May 30 11:12:58 2009
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Reader for plain text files that are consumed line by line.
+ */
+public class PlainFileReader extends Reader {
+
+  private BufferedReader reader;
+
+  /**
+   * @param reader the reader the lines are read from
+   */
+  public PlainFileReader(BufferedReader reader) {
+    super();
+    this.reader = reader;
+  }
+
+  /**
+   * Reads all remaining lines of the underlying reader.
+   *
+   * @return the lines in reading order, each with leading and trailing whitespace removed; blank
+   *         lines are kept as empty strings
+   * @throws IOException if reading fails
+   */
+  public String[] readLines() throws IOException {
+    List<String> collected = new ArrayList<String>();
+
+    for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
+      collected.add(raw.trim());
+    }
+
+    return collected.toArray(new String[collected.size()]);
+  }
+
+  @Override
+  public void close() throws IOException {
+    reader.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    return reader.read(cbuf, off, len);
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/PlainFileReader.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java Sat May 30 11:12:58 2009
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.lucas.indexer.util;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+public class TokenStreamStringConcatenator {
+
+  /**
+   * Builds a string from the term texts of all tokens of a token stream and delimits consecutive
+   * tokens with the given delimiter. No delimiter is placed before the first or after the last
+   * token.
+   *
+   * @param ts the token stream that is read until it is exhausted
+   * @param delimiter the text placed between two consecutive tokens; must not be null
+   * @return a string made of the delimited term texts; empty if the stream has no tokens
+   * @throws IOException if reading from the token stream fails
+   * @throws NullPointerException if delimiter is null
+   */
+  public String tokenStreamToStringWithDelimiter(TokenStream ts, String delimiter)
+          throws IOException {
+    if (delimiter == null)
+      throw new NullPointerException("delimiter");
+
+    // Append into one builder instead of re-copying the whole string for every token.
+    StringBuilder tokenString = new StringBuilder();
+    Token newToken = new Token();
+    boolean first = true;
+    for (Token token = ts.next(newToken); token != null; token = ts.next(newToken)) {
+      if (!first)
+        tokenString.append(delimiter);
+
+      tokenString.append(token.termBuffer(), 0, token.termLength());
+      first = false;
+    }
+    return tokenString.toString();
+  }
+}
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/java/org/apache/uima/lucas/indexer/util/TokenStreamStringConcatenator.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml Sat May 30 11:12:58 2009
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier">
+ <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+ <implementationName>de.julielab.jules.consumer.LuceneCASIndexer</implementationName>
+ <processingResourceMetaData>
+ <name>LuceneCASIndexerDescriptor</name>
+ <description/>
+ <version>2.0</version>
+ <vendor>julielab</vendor>
+ <configurationParameters>
+ <configurationParameter>
+ <name>indexOutDir</name>
+ <description>defines the output directory where the index should be written</description>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>true</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>mappingFile</name>
+ <description>path to the mapping file</description>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>true</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>stopwordFile</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>hypernymFile</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>tokenMappingFile</name>
+ <description>Mapping file for replacement of tokens.</description>
+ <type>String</type>
+ <multiValued>true</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>uniqueIndex</name>
+ <type>Boolean</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>ramBufferSize</name>
+ <description>Sets the ram buffer size of the index writer. See lucene docs for further information.</description>
+ <type>Integer</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>compoundFileFormat</name>
+ <description>Determines whether the index writer should use compound file format or not.</description>
+ <type>Boolean</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ </configurationParameters>
+ <configurationParameterSettings>
+ <nameValuePair>
+ <name>mappingFile</name>
+ <value>
+ <string>src/test/resources/lucas.xml</string>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>indexOutDir</name>
+ <value>
+ <string>src/test/resources/testIndex</string>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>stopwordFile</name>
+ <value>
+ <string>src/test/resources/stopwords.txt</string>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>hypernymFile</name>
+ <value>
+ <string>src/test/resources/hypernyms.txt</string>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>tokenMappingFile</name>
+ <value>
+ <array>
+ <string>src/test/resources/tokenMapping.txt</string>
+ </array>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>uniqueIndex</name>
+ <value>
+ <boolean>true</boolean>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>ramBufferSize</name>
+ <value>
+ <integer>512</integer>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>compoundFileFormat</name>
+ <value>
+ <boolean>true</boolean>
+ </value>
+ </nameValuePair>
+ </configurationParameterSettings>
+ <typeSystemDescription/>
+ <typePriorities/>
+ <fsIndexCollection/>
+ <capabilities>
+ <capability>
+ <inputs/>
+ <outputs/>
+ <languagesSupported/>
+ </capability>
+ </capabilities>
+ <operationalProperties>
+ <modifiesCas>false</modifiesCas>
+ <multipleDeploymentAllowed>false</multipleDeploymentAllowed>
+ <outputsNewCASes>false</outputsNewCASes>
+ </operationalProperties>
+ </processingResourceMetaData>
+ <resourceManagerConfiguration/>
+</casConsumerDescription>
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/resources/LuceneCASIndexer.xml
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd?rev=780236&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd (added)
+++ incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd Sat May 30 11:12:58 2009
@@ -0,0 +1,121 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ elementFormDefault="qualified">
+
+ <xs:simpleType name="indexType"> <!-- String enumeration used as the type of the "index" attribute on the "field" element; only the five literals listed below validate. NOTE(review): these presumably correspond to Lucene indexing modes; confirm against the code that reads this attribute. -->
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="yes"/>
+ <xs:enumeration value="no"/>
+ <xs:enumeration value="no_norms"/>
+ <xs:enumeration value="no_tf"/>
+ <xs:enumeration value="no_norms_tf"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:simpleType name="termVectorType"> <!-- String enumeration used as the type of the "termVector" attribute on the "field" element; only the four literals listed below validate. NOTE(review): presumably selects what is kept in the Lucene term vector; confirm against the consuming code. -->
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="no"/>
+ <xs:enumeration value="positions"/>
+ <xs:enumeration value="offsets"/>
+ <xs:enumeration value="positions_offsets"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:simpleType name="storedType"> <!-- String enumeration used as the type of the "stored" attribute on the "field" element; only "yes", "no" and "compress" validate. NOTE(review): "compress" presumably means stored in compressed form; confirm against the consuming code. -->
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="yes"/>
+ <xs:enumeration value="no"/>
+ <xs:enumeration value="compress"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:simpleType name="tokenizerType"> <!-- String enumeration used as the type of the "tokenizer" attribute on the "annotation" element; only "cas", "whitespace" and "standard" validate. NOTE(review): the tokenizer implementation each literal selects is not visible in this schema; verify in the indexer code. -->
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="cas"/>
+ <xs:enumeration value="whitespace"/>
+ <xs:enumeration value="standard"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:simpleType name="positionType"> <!-- String enumeration used as the type of the "position" attribute on the "annotation" element; only "first" and "last" validate. NOTE(review): what "first"/"last" refer to is not stated in this schema; confirm against the consuming code. -->
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="first"/>
+ <xs:enumeration value="last"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="feature"> <!-- Global element referenced as a child of "annotation". It declares no child elements, only the four attributes below. -->
+ <xs:complexType> <!-- No attribute carries use="required", so all four are optional for validation purposes. -->
+ <xs:attribute name="name" type="xs:string"/>
+ <xs:attribute name="uppercase" type="xs:boolean"/>
+ <xs:attribute name="lowercase" type="xs:boolean"/>
+ <xs:attribute name="numberFormat" type="xs:string"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:element name="annotation"> <!-- Global element referenced as a child of "field". Content is a sequence of "feature" children followed by the attribute declarations below; none is marked use="required", so every attribute is optional for validation purposes. -->
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="feature" minOccurs="0" maxOccurs="unbounded"/> <!-- Zero or more "feature" children are allowed. -->
+ </xs:sequence>
+ <xs:attribute name="type" type="xs:string"/>
+ <xs:attribute name="sofa" type="xs:string"/>
+ <xs:attribute name="featurePath" type="xs:string"/>
+ <xs:attribute name="concatString" type="xs:string"/>
+ <xs:attribute name="splitString" type="xs:string"/>
+ <xs:attribute name="prefix" type="xs:string"/>
+ <xs:attribute name="uppercase" type="xs:boolean"/>
+ <xs:attribute name="lowercase" type="xs:boolean"/>
+ <xs:attribute name="stopwordRemove" type="xs:boolean"/>
+ <xs:attribute name="position" type="positionType"/> <!-- Restricted to the literals of positionType. -->
+ <xs:attribute name="addHypernyms" type="xs:boolean"/>
+ <xs:attribute name="mappingFile" type="xs:string"/>
+ <xs:attribute name="snowballFilter" type="xs:string"/>
+ <xs:attribute name="unique" type="xs:boolean"/>
+ <xs:attribute name="tokenizer" type="tokenizerType"/> <!-- Restricted to the literals of tokenizerType. -->
+ </xs:complexType>
+ </xs:element>
+
+ <xs:element name="field"> <!-- Global element referenced as a child of "fields". Content is a sequence of "annotation" children followed by the attribute declarations below; none is marked use="required", so every attribute is optional for validation purposes. -->
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="annotation" maxOccurs="unbounded"/> <!-- minOccurs is not given and defaults to 1, so at least one "annotation" child is required. -->
+ </xs:sequence>
+ <xs:attribute name="name" type="xs:string"/>
+ <xs:attribute name="index" type="indexType"/> <!-- Restricted to the literals of indexType. -->
+ <xs:attribute name="termVector" type="termVectorType"/> <!-- Restricted to the literals of termVectorType. -->
+ <xs:attribute name="delimiter" type="xs:string"/>
+ <xs:attribute name="stored" type="storedType"/> <!-- Restricted to the literals of storedType. -->
+ <xs:attribute name="merge" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:element name="fields"> <!-- Container element holding "field" children; it declares no attributes. It is the only global element in this schema that no other element references, so it is presumably the intended document root (confirm against the mapping files that use this schema). -->
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="field" maxOccurs="unbounded"/> <!-- minOccurs is not given and defaults to 1, so at least one "field" child is required. -->
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+
+</xs:schema>
\ No newline at end of file
Propchange: incubator/uima/sandbox/trunk/Lucas/src/main/resources/lucas.xsd
------------------------------------------------------------------------------
svn:mime-type = text/plain