You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2011/08/01 17:25:50 UTC

svn commit: r1152824 [19/21] - in /uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker: ./ action/ batch/ condition/ engine/ kernel/ kernel/constraint/ kernel/expression/ kernel/expression/bool...

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordList.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordList.java?rev=1152824&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordList.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordList.java Mon Aug  1 15:24:44 2011
@@ -0,0 +1,1072 @@
+package org.apache.uima.tm.textmarker.resource.trie;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.tm.textmarker.kernel.TextMarkerStream;
+import org.apache.uima.tm.textmarker.kernel.type.TextMarkerBasic;
+import org.apache.uima.tm.textmarker.resource.TextMarkerWordList;
+
+
+/**
+ * Class MultiTreeWordList.
+ * 
+ * @author Daniel Wieth, 1570292
+ * 
+ */
+public class MultiTreeWordList implements TextMarkerWordList {
+
+  /** Character encoding used when reading plain-text word list files. */
+  private static final String ENCODING = "UTF-8";
+
+  /** Helper that reads ".mtwl" word list files into the tree. */
+  private MultiTreeWordListPersistence persistence = new MultiTreeWordListPersistence();
+
+  /** The root of the TreeWordList. */
+  protected MultiTextNode root;
+
+  /** The cost model we are using. */
+  private EditDistanceCostMap costMap;
+
+  /**
+   * Creates an empty word list by delegating to the array constructor with no path names.
+   */
+  public MultiTreeWordList() {
+    this(new String[0]);
+  }
+
+  /**
+   * Creates a word list from a single file or from all word list files of a directory.
+   * 
+   * Files whose name ends with ".txt" are read line by line via {@link #buildNewTree(String)};
+   * files whose name ends with ".mtwl" are handed to the persistence helper. Files with any other
+   * extension are ignored.
+   * 
+   * @param pathname
+   *          path of a word list file, or of a directory whose direct children are loaded.
+   */
+  public MultiTreeWordList(String pathname) {
+
+    this.root = new MultiTextNode();
+    this.costMap = new EditDistanceCostMap();
+    File location = new File(pathname);
+
+    if (!location.isDirectory()) {
+      loadWordListFile(location);
+      return;
+    }
+
+    // File.listFiles() returns null if the directory cannot be read; nothing to load then.
+    File[] entries = location.listFiles();
+    if (entries == null) {
+      return;
+    }
+
+    for (File entry : entries) {
+      loadWordListFile(entry);
+    }
+  }
+
+  /** Adds the content of one file to the tree, chosen by the file name extension. */
+  private void loadWordListFile(File file) {
+    String name = file.getName();
+    if (name.endsWith(".txt")) {
+      buildNewTree(file.getAbsolutePath());
+    }
+    if (name.endsWith(".mtwl")) {
+      persistence.readMTWL(root, file.getAbsolutePath());
+    }
+  }
+
+  /**
+   * Creates a word list from several files. Path names ending with ".mtwl" are read by the
+   * persistence helper, path names ending with ".txt" are read line by line; all other path names
+   * are skipped.
+   * 
+   * @param pathnames
+   *          paths of the files to build the word list from
+   */
+  public MultiTreeWordList(String[] pathnames) {
+
+    this.root = new MultiTextNode();
+    this.costMap = new EditDistanceCostMap();
+
+    for (int i = 0; i < pathnames.length; i++) {
+      String current = pathnames[i];
+
+      if (current.endsWith(".mtwl")) {
+        persistence.readMTWL(root, current);
+      } else if (current.endsWith(".txt")) {
+        buildNewTree(current);
+      }
+    }
+  }
+
+  /**
+   * Reads the file at pathname line by line (UTF-8) and adds every trimmed line as a word to the
+   * existing tree. The simple file name is used as the type of all added words.
+   * 
+   * I/O problems are reported on standard error via printStackTrace(); words read before the
+   * failure stay in the tree.
+   * 
+   * @param pathname
+   *          Absolute path of the file containing the words for the treeWordList
+   */
+  public void buildNewTree(String pathname) {
+
+    File f = new File(pathname);
+    FileInputStream fstream = null;
+    BufferedReader br = null;
+
+    try {
+      fstream = new FileInputStream(f);
+      br = new BufferedReader(new InputStreamReader(fstream, ENCODING));
+      String s = null;
+
+      while ((s = br.readLine()) != null) {
+        addWord(s.trim(), f.getName());
+      }
+    } catch (IOException e) {
+      // Also covers FileNotFoundException; same reporting as before.
+      e.printStackTrace();
+    } finally {
+      // Release the file handle even if reading failed half way through.
+      try {
+        if (br != null) {
+          // Closing the reader closes the wrapped stream as well.
+          br.close();
+        } else if (fstream != null) {
+          fstream.close();
+        }
+      } catch (IOException e) {
+        e.printStackTrace();
+      }
+    }
+  }
+
+  /**
+   * Add a new String into the MultiTreeWordList.
+   * 
+   * @param s
+   *          The String to add
+   * @param type
+   *          The type of the string.
+   */
+  public void addWord(String s, String type) {
+
+    // Create Nodes from all chars of the strings besides the last one
+    MultiTextNode pointer = root;
+
+    for (Character each : s.toCharArray()) {
+
+      MultiTextNode childNode = pointer.getChildNode(each);
+
+      if (childNode == null) {
+        childNode = new MultiTextNode(each, false);
+        pointer.addChild(childNode);
+      }
+
+      pointer = childNode;
+    }
+    pointer.setWordEnd(s.length() > 0);
+    pointer.addType(type);
+  }
+
+  /**
+   * Collects the types of every node of the tree, i.e. the type cone of the root.
+   * 
+   * @return all Types contained by the MultiTreeWordList.
+   */
+  public Collection<String> getTypes() {
+    Collection<String> allTypes = getTypeCone(root);
+    return allTypes;
+  }
+
+  /**
+   * Returns all types contained by the cone of the MultiTextNode node, including the types of node
+   * itself. Every type is reported once, in the order in which a depth-first walk first meets it.
+   * 
+   * @param node
+   *          The node where we start, the root of the cone.
+   * @return all types contained by the cone of the MultiTextNode node, including the types of node
+   *         itself.
+   */
+  public Collection<String> getTypeCone(MultiTextNode node) {
+
+    List<String> returnList = new LinkedList<String>();
+    collectTypeCone(node, new HashSet<String>(), returnList);
+    return returnList;
+  }
+
+  /**
+   * Depth-first helper of getTypeCone: appends every type not yet in seen to cone. The set makes
+   * the duplicate check constant time instead of a list scan per type.
+   */
+  private void collectTypeCone(MultiTextNode node, Set<String> seen, List<String> cone) {
+
+    if (node.getTypes() != null) {
+      for (String s : node.getTypes()) {
+        // Set.add returns false for an already known type.
+        if (seen.add(s)) {
+          cone.add(s);
+        }
+      }
+    }
+
+    for (Character c : node.getChildren().keySet()) {
+      collectTypeCone(node.getChildNode(c), seen, cone);
+    }
+  }
+
+  /**
+   * Lists every string stored in the MultiTreeWordList in natural (sorted) order.
+   * 
+   * @return All strings contained by the MultiTreeWordList.
+   */
+  public Collection<String> keySet() {
+    Collection<String> unsorted = keySet(root, "");
+    List<String> sorted = new LinkedList<String>(unsorted);
+    Collections.sort(sorted);
+    return sorted;
+  }
+
+  /**
+   * Collects the strings stored below (and at) the given node. Each reported string is the given
+   * prefix followed by the characters on the path from node to a word-end node.
+   * 
+   * @param node
+   *          the node we are considering.
+   * @param prefix
+   *          the characters on the path from the root to node.
+   * @return All strings contained by the cone of the MultiTextNode node.
+   */
+  private Collection<String> keySet(MultiTextNode node, String prefix) {
+
+    List<String> words = new LinkedList<String>();
+
+    // A word may end here and still be the prefix of longer words.
+    if (node.isWordEnd()) {
+      words.add(prefix);
+    }
+
+    // Descend into every child with the prefix extended by the child's character.
+    for (Character childChar : node.getChildren().keySet()) {
+      MultiTextNode child = node.getChildNode(childChar);
+      words.addAll(keySet(child, prefix + childChar));
+    }
+
+    return words;
+  }
+
+  /**
+   * Returns the types of the string s using a case sensitive lookup.
+   * 
+   * @param s
+   *          The string with the types.
+   * @return All types from the very string s.
+   */
+  public Collection<String> getTypes(String s) {
+    final boolean ignoreCase = false;
+    return getTypes(s, ignoreCase);
+  }
+
+  /**
+   * Returns the types of the string s. The lookup runs an edit distance search with distance 0 and
+   * no ignored characters and merges the types of all reported matches into one set.
+   * 
+   * @param s
+   *          The string with the types.
+   * @param ignoreCase
+   *          Indicates, whether we search case sensitive or not.
+   * @return The types of the string s.
+   */
+  public Collection<String> getTypes(String s, boolean ignoreCase) {
+
+    Set<String> collected = new HashSet<String>();
+
+    for (Set<String> typesOfMatch : editDistance(s, 0, ignoreCase, "").values()) {
+      collected.addAll(typesOfMatch);
+    }
+
+    return collected;
+  }
+
+  /**
+   * Returns a list of types which belong to a string. The types of all matches found by the edit
+   * distance search are concatenated, so a type may appear more than once.
+   * 
+   * @param string
+   *          The string which types we want to have.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param ignoreLength
+   *          Case is only ignored if ignoreCase is set and the string has at least this many
+   *          characters.
+   * @param edit
+   *          Indicates whether we use an edit distance or not. Not evaluated by this method.
+   * @param distance
+   *          The edit distance to a string contained by the MultiTreeWordList; truncated to an
+   *          int.
+   * @param ignoreToken
+   *          Characters which can be ignored.
+   * @return Returns a list of types which belong to a string.
+   */
+  public List<String> contains(String string, boolean ignoreCase, int ignoreLength, boolean edit,
+          double distance, String ignoreToken) {
+
+    boolean caseInsensitive = ignoreCase && string.length() >= ignoreLength;
+    Map<String, Set<String>> matches = editDistance(string, (int) distance, caseInsensitive,
+            ignoreToken, false);
+
+    List<String> types = new LinkedList<String>();
+    for (Entry<String, Set<String>> match : matches.entrySet()) {
+      types.addAll(match.getValue());
+    }
+    return types;
+  }
+
+  /**
+   * Checks whether a string is contained by the MultiTreeWordList or not.
+   * 
+   * The case handling follows the sibling methods contains, containsFragment and
+   * containsFragmentBool: case is only ignored if ignoreCase is set and the string has at least
+   * ignoreLength characters.
+   * 
+   * @param string
+   *          The string which is contained or not.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param ignoreLength
+   *          Minimum length of the string for ignoreCase to take effect.
+   * @param edit
+   *          Indicates whether we use an edit distance or not. Not evaluated by this method.
+   * @param distance
+   *          The edit distance to a string contained by the MultiTreeWordList.
+   * @param ignoreToken
+   *          Characters which can be ignored. Not evaluated by this method; the cost map of this
+   *          word list is used unchanged.
+   * @return true, if the string is contained by the MultiTreeWordList, false otherwise.
+   */
+  public boolean containsBool(String string, boolean ignoreCase, int ignoreLength, boolean edit,
+          double distance, String ignoreToken) {
+    boolean caseInsensitive = ignoreCase && string.length() >= ignoreLength;
+    return editDistanceBool(root, string, "", distance, 0, caseInsensitive, false, costMap);
+  }
+
+  /**
+   * Checks whether the tree contains exactly the string s (case sensitive lookup).
+   * 
+   * @param s
+   *          The string which is contained or not.
+   * @return True, if the TreeWordList contains exactly the string s, false otherwise.
+   */
+  public boolean contains(String s) {
+    final boolean ignoreCase = false;
+    return contains(s, ignoreCase);
+  }
+
+  /**
+   * Checks whether the tree contains the string s.
+   * 
+   * @param s
+   *          The string which is contained or not.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @return True, if the TreeWordList contains the string s, false otherwise.
+   */
+  public boolean contains(String s, boolean ignoreCase) {
+    return contains(s, ignoreCase, 0, new char[] {}, 0);
+  }
+
+  /**
+   * Checks if the MultiTreeWordList contains the string s.
+   * 
+   * The search uses a fresh cost map in which every character of ignoreChars has delete costs of
+   * zero, and maxIgnoreChars as the available edit distance.
+   * 
+   * @param s
+   *          The string which is contained or not.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param size
+   *          Case is only ignored for strings longer than this, the same rule the five-argument
+   *          containsFragment applies.
+   * @param ignoreChars
+   *          Characters which can be ignored; null is treated like an empty array.
+   * @param maxIgnoreChars
+   *          The maximum number of ignored characters.
+   * @return true, if TreeWordList contains the string, false otherwise.
+   */
+  public boolean contains(String s, boolean ignoreCase, int size, char[] ignoreChars,
+          int maxIgnoreChars) {
+
+    EditDistanceCostMap edm = new EditDistanceCostMap();
+
+    // recursiveContains accepts a null array as well, so do the same here instead of failing.
+    if (ignoreChars != null) {
+      for (char c : ignoreChars) {
+        edm.setDeleteCosts(c, 0.0);
+      }
+    }
+
+    boolean caseInsensitive = ignoreCase && s.length() > size;
+    return editDistanceBool(root, s, "", maxIgnoreChars, 0, caseInsensitive, false, edm);
+  }
+
+  /**
+   * Checks if the string s is a fragment (prefix) of an entry of the MultiTreeWordList, using the
+   * plain recursive tree walk.
+   * 
+   * @param s
+   *          The string which is contained or not.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param size
+   *          Case is only ignored for strings longer than this.
+   * @param ignoreChars
+   *          Characters which can be ignored.
+   * @param maxIgnoreChars
+   *          The maximum number of ignored characters.
+   * @return true, if the walk through the tree succeeds for s, false otherwise.
+   */
+  public boolean containsFragment(String s, boolean ignoreCase, int size, char[] ignoreChars,
+          int maxIgnoreChars) {
+    boolean caseInsensitive = ignoreCase && s.length() > size;
+    return recursiveContains(root, s, 0, caseInsensitive, true, ignoreChars, maxIgnoreChars);
+  }
+
+  /**
+   * Runs the boolean edit distance search in fragment mode for the given string.
+   * 
+   * @param string
+   *          The string to look up.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param ignoreLength
+   *          Case is only ignored if ignoreCase is set and the string has at least this many
+   *          characters.
+   * @param edit
+   *          Indicates whether we use an edit distance or not. Not evaluated by this method.
+   * @param distance
+   *          The edit distance to a string contained by the MultiTreeWordList.
+   * @param ignoreToken
+   *          Characters which can be ignored. Not evaluated by this method.
+   * @return true, if the fragment search succeeds, false otherwise.
+   */
+  public boolean containsFragmentBool(String string, boolean ignoreCase, int ignoreLength,
+          boolean edit, double distance, String ignoreToken) {
+
+    boolean caseInsensitive = ignoreCase && string.length() >= ignoreLength;
+    return editDistanceBool(root, string, "", distance, 0, caseInsensitive, true, costMap);
+  }
+
+  /**
+   * Returns the distinct types reported by an edit distance search in fragment mode for the given
+   * string.
+   * 
+   * @param string
+   *          The string to look up.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param ignoreLength
+   *          Case is only ignored if ignoreCase is set and the string has at least this many
+   *          characters.
+   * @param edit
+   *          Indicates whether we use an edit distance or not. Not evaluated by this method.
+   * @param distance
+   *          The edit distance to a string contained by the MultiTreeWordList; truncated to an
+   *          int.
+   * @param ignoreToken
+   *          Characters which can be ignored.
+   * @return The types of all fragment matches, each type listed once.
+   */
+  public List<String> containsFragment(String string, boolean ignoreCase, int ignoreLength,
+          boolean edit, double distance, String ignoreToken) {
+
+    boolean caseInsensitive = ignoreCase && string.length() >= ignoreLength;
+    Map<String, Set<String>> matches = editDistance(string, (int) distance, caseInsensitive,
+            ignoreToken, true);
+
+    List<String> distinctTypes = new LinkedList<String>();
+
+    for (Set<String> typesOfMatch : matches.values()) {
+      for (String type : typesOfMatch) {
+        if (distinctTypes.contains(type)) {
+          continue;
+        }
+        distinctTypes.add(type);
+      }
+    }
+
+    return distinctTypes;
+  }
+
+  /**
+   * Returns true, if the MultiTreeWordList contains the string text, false otherwise.
+   * 
+   * @param pointer
+   *          The MultiTextNode we are looking at.
+   * @param text
+   *          The string which is contained or not.
+   * @param index
+   *          The index of the string text we checked until now.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param fragment
+   *          Indicates whether we are looking for a prefix of the string text.
+   * @param ignoreChars
+   *          Characters which can be ignored.
+   * @param maxIgnoreChars
+   *          Maximum number of characters which are allowed to be ignored.
+   * @return True, if the TreeWordList contains the string text, false otherwise.
+   */
+  private boolean recursiveContains(MultiTextNode pointer, String text, int index,
+          boolean ignoreCase, boolean fragment, char[] ignoreChars, int maxIgnoreChars) {
+
+    if (pointer == null) {
+      return false;
+    }
+
+    if (index == text.length()) {
+      return fragment || pointer.isWordEnd();
+    }
+
+    char charAt = text.charAt(index);
+    boolean charAtIgnored = false;
+
+    if (ignoreChars != null) {
+      for (char each : ignoreChars) {
+        if (each == charAt) {
+          charAtIgnored = true;
+          break;
+        }
+      }
+      charAtIgnored &= index != 0;
+    }
+
+    int next = ++index;
+
+    if (ignoreCase) {
+
+      // Lower Case Node.
+      MultiTextNode childNodeL = pointer.getChildNode(Character.toLowerCase(charAt));
+
+      // Upper Case Node.
+      MultiTextNode childNodeU = pointer.getChildNode(Character.toUpperCase(charAt));
+
+      if (charAtIgnored && childNodeL == null && childNodeU == null) {
+        // Character is ignored and does not appear.
+        return recursiveContains(pointer, text, next, ignoreCase, fragment, ignoreChars,
+                maxIgnoreChars);
+      } else {
+        // Recursion.
+        return recursiveContains(childNodeL, text, next, ignoreCase, fragment, ignoreChars,
+                maxIgnoreChars)
+                || recursiveContains(childNodeU, text, next, ignoreCase, fragment, ignoreChars,
+                        maxIgnoreChars);
+      }
+
+    } else {
+      // Case sensitive.
+      MultiTextNode childNode = pointer.getChildNode(charAt);
+
+      if (charAtIgnored && childNode == null) {
+        // Recursion with incremented index.
+        return recursiveContains(pointer, text, next, ignoreCase, fragment, ignoreChars,
+                maxIgnoreChars);
+      } else {
+        // Recursion with new node.
+        return recursiveContains(childNode, text, next, ignoreCase, fragment, ignoreChars,
+                maxIgnoreChars);
+      }
+    }
+  }
+
+  /**
+   * Scans the stream for sequences of basic annotations whose concatenated covered text is known
+   * to the word list and collects one new annotation per matching type that is mapped in typeMap.
+   * 
+   * Every position of the stream is used as an anchor. Starting with the anchor's covered text,
+   * the candidate string is extended by the covered text of the following basics for as long as
+   * containsFragment reports at least one type for the candidate.
+   * 
+   * @param stream
+   *          The stream to scan; it is moved to its first position and then advanced to its end.
+   * @param typeMap
+   *          Maps the type names stored in the word list to CAS types; names without a mapping
+   *          produce no annotation.
+   * @param ignoreCase
+   *          Passed on to the lookups.
+   * @param ignoreLength
+   *          Passed on to the lookups.
+   * @param edit
+   *          Passed on to the lookups.
+   * @param distance
+   *          Passed on to the lookups.
+   * @param ignoreToken
+   *          Passed on to the lookups.
+   * @return The created annotations.
+   */
+  public Collection<AnnotationFS> find(TextMarkerStream stream, Map<String, Type> typeMap,
+          boolean ignoreCase, int ignoreLength, boolean edit, double distance, String ignoreToken) {
+
+    Collection<AnnotationFS> results = new HashSet<AnnotationFS>();
+    stream.moveToFirst();
+    // Second iterator used to look ahead from the current anchor.
+    FSIterator<AnnotationFS> streamPointer = stream.copy();
+
+    while (stream.isValid()) {
+      TextMarkerBasic anchorBasic = (TextMarkerBasic) stream.get();
+      streamPointer.moveTo(anchorBasic);
+
+      // Basics covered by the current candidate; they define begin and end of a new annotation.
+      List<TextMarkerBasic> basicsToAdd = new ArrayList<TextMarkerBasic>();
+      basicsToAdd.add(anchorBasic);
+      String text = anchorBasic.getCoveredText();
+      StringBuilder candidate = new StringBuilder(text);
+      String lastCandidate = candidate.toString();
+      // Never filled in this method; only handed through to tryToCreateAnnotation.
+      List<AnnotationFS> interResults = new ArrayList<AnnotationFS>();
+
+      while (streamPointer.isValid()) {
+
+        List<String> types = containsFragment(candidate.toString(), ignoreCase, ignoreLength, edit,
+                distance, ignoreToken);
+
+        if (!types.isEmpty()) {
+          streamPointer.moveToNext();
+          if (streamPointer.isValid()) {
+            TextMarkerBasic next = (TextMarkerBasic) streamPointer.get();
+
+            // Check the candidate built so far before it is extended by the next basic.
+            tryToCreateAnnotation(stream, results, basicsToAdd, candidate.toString(), interResults,
+                    ignoreCase, ignoreLength, edit, distance, ignoreToken, typeMap);
+
+            lastCandidate = candidate.toString();
+            candidate.append(next.getCoveredText());
+            basicsToAdd.add(next);
+
+          } else {
+            // End of the stream reached.
+            // NOTE(review): this looks up lastCandidate (the text without the most recently
+            // appended basic) while basicsToAdd already contains that basic, so the looked-up
+            // text and the annotated span can differ - confirm whether candidate was intended.
+            tryToCreateAnnotation(stream, results, basicsToAdd, lastCandidate, interResults,
+                    ignoreCase, ignoreLength, edit, distance, ignoreToken, typeMap);
+          }
+        } else {
+
+          // No entry can be reached from this candidate any more: stop extending and continue
+          // with the next anchor (breaks the inner while loop).
+          break;
+        }
+
+      }
+
+      stream.moveToNext();
+    }
+
+    return results;
+  }
+
+  /**
+   * Not supported by this word list: fails an assertion (if assertions are enabled) and otherwise
+   * returns a new empty list.
+   */
+  public List<AnnotationFS> find(TextMarkerStream stream, boolean ignoreCase, int size,
+          char[] ignoreChars, int maxIgnoredChars) {
+    assert false;
+    List<AnnotationFS> nothing = new ArrayList<AnnotationFS>();
+    return nothing;
+  }
+
+  /**
+   * Looks up the types of lastCandidate and adds one annotation per type that has a mapping in
+   * map to results. The annotation spans from the begin of the first to the end of the last
+   * element of basicsToAdd.
+   * 
+   * If basicsToAdd is empty although types were found, the non-empty interResult is added to
+   * results instead.
+   */
+  private void tryToCreateAnnotation(TextMarkerStream stream, Collection<AnnotationFS> results,
+          List<TextMarkerBasic> basicsToAdd, String lastCandidate, List<AnnotationFS> interResult,
+          boolean ignoreCase, int ignoreLength, boolean edit, double distance, String ignoreToken,
+          Map<String, Type> map) {
+
+    List<String> matchedTypes = contains(lastCandidate, ignoreCase, ignoreLength, edit, distance,
+            ignoreToken);
+
+    if (basicsToAdd.size() < 1 && !matchedTypes.isEmpty()) {
+      // Nothing to span an annotation over: fall back to the intermediate results.
+      if (interResult != null && !interResult.isEmpty()) {
+        results.addAll(interResult);
+      }
+      return;
+    }
+
+    for (String typeName : matchedTypes) {
+      Type type = map.get(typeName);
+      if (type == null) {
+        continue;
+      }
+      int begin = basicsToAdd.get(0).getBegin();
+      int end = basicsToAdd.get(basicsToAdd.size() - 1).getEnd();
+      AnnotationFS created = stream.getCas().createAnnotation(type, begin, end);
+      results.add(created);
+    }
+  }
+
+  /**
+   * Case sensitive edit distance search without ignored characters.
+   * 
+   * @param query
+   *          The query string.
+   * @param distance
+   *          The specified edit distance.
+   * @return A map with all strings with a specified edit distance to the string query as keys and
+   *         the files they belong to as values.
+   */
+  public Map<String, Set<String>> editDistance(String query, int distance) {
+    final boolean ignoreCase = false;
+    final String noIgnoredChars = "";
+    return editDistance(query, distance, ignoreCase, noIgnoredChars);
+  }
+
+  /**
+   * Edit distance search for whole strings (fragment mode switched off).
+   * 
+   * @param query
+   *          The query string.
+   * @param distance
+   *          The specified edit distance.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param ignoreToken
+   *          Characters which can be ignored.
+   * @return A map with all strings with a specified edit distance to the string query as keys and
+   *         the files they belong to as values.
+   */
+  public Map<String, Set<String>> editDistance(String query, int distance, boolean ignoreCase,
+          String ignoreToken) {
+    final boolean fragment = false;
+    return editDistance(query, distance, ignoreCase, ignoreToken, fragment);
+  }
+
+  /**
+   * Returns a map with all strings with a specified edit distance to the string query as keys and
+   * the files they belong to as values.
+   * 
+   * The search runs on a fresh cost map in which the insert costs of every character of
+   * ignoreToken are set to zero. For a case insensitive search the query is lower-cased before
+   * the search.
+   * 
+   * @param query
+   *          The query string.
+   * @param distance
+   *          The specified edit distance.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param ignoreToken
+   *          Characters which can be ignored; null is treated like the empty string.
+   * @param fragment
+   *          Indicates whether we search for fragments of the query string or not.
+   * @return A map with all strings with a specified edit distance to the string query as keys and
+   *         the files they belong to as values.
+   */
+  public Map<String, Set<String>> editDistance(String query, int distance, boolean ignoreCase,
+          String ignoreToken, boolean fragment) {
+
+    Map<Character, Double> oldInsertCosts = new HashMap<Character, Double>();
+    EditDistanceCostMap edcm = new EditDistanceCostMap();
+
+    // We need to store the old insert costs before we set them to zero.
+    if (ignoreToken != null) {
+      for (char c : ignoreToken.toCharArray()) {
+        // Only the first occurrence carries the real old cost; a repeated character would
+        // otherwise overwrite it with the zero we just set.
+        if (!oldInsertCosts.containsKey(c)) {
+          oldInsertCosts.put(c, edcm.getInsertCosts(c));
+        }
+        edcm.setInsertCosts(c, 0.0);
+      }
+    }
+
+    String effectiveQuery = ignoreCase ? query.toLowerCase() : query;
+
+    try {
+      return editDistanceClever(root, effectiveQuery, "", distance, 0, ignoreCase, fragment, edcm,
+              false, false);
+    } finally {
+      // Restore the old insert costs (the original code wrote them back as delete costs).
+      for (Entry<Character, Double> c : oldInsertCosts.entrySet()) {
+        edcm.setInsertCosts(c.getKey(), c.getValue());
+      }
+    }
+  }
+
+  /**
+   * Recursive search that collects the strings reachable from node within the remaining edit
+   * distance, together with their types.
+   * 
+   * At each node the method tries, in this order: a delete step (advance in the query, stay on the
+   * node), recording a result if the node is a word end or fragment mode is on, and for every
+   * child a match step, a substitute step and an insert step. A delete step is never tried
+   * directly after an insert step and vice versa.
+   * 
+   * @param node
+   *          The MultiTextNode which is under consideration at the moment.
+   * @param query
+   *          The query string.
+   * @param result
+   *          The string built from the characters of the nodes visited so far.
+   * @param distance
+   *          The remaining edit distance.
+   * @param index
+   *          The index of the query string at the moment.
+   * @param ignoreCase
+   *          Indicates whether we search case sensitive or not.
+   * @param fragment
+   *          Indicates whether we search for fragments of the query string or not. In fragment
+   *          mode every node may produce a result and the reported types are those of the whole
+   *          type cone of the node.
+   * @param edm
+   *          The edit distance cost map we are using.
+   * @param lastActionInsert
+   *          Indicates whether the last action was an insert action.
+   * @param lastActionDelete
+   *          Indicates whether the last action was a delete action.
+   * @return A map with all strings with a specified edit distance to the string query as keys and
+   *         the files they belong to as values.
+   */
+  private Map<String, Set<String>> editDistanceClever(MultiTextNode node, String query,
+          String result, double distance, int index, boolean ignoreCase, boolean fragment,
+          EditDistanceCostMap edm, boolean lastActionInsert, boolean lastActionDelete) {
+
+    EditDistanceResultMap resultMap = new EditDistanceResultMap();
+
+    if (!lastActionInsert) {
+      // Delete: advance in the query but stay on the same node. Not tried while result is still
+      // empty.
+      // NOTE(review): the cost is looked up for the character of the current node, not for the
+      // skipped query character, and index + 1 is not checked against the query length -
+      // confirm that this is intended.
+      if (distance - edm.getDeleteCosts(node.getValue()) >= 0 && result.length() > 0) {
+        resultMap.putAll(editDistanceClever(node, query, result, distance
+                - edm.getDeleteCosts(node.getValue()), index + 1, ignoreCase, fragment, edm, false,
+                true));
+      }
+    }
+
+    // Possible result: the node ends a word, or any node in fragment mode.
+    if (node.isWordEnd() || fragment) {
+
+      HashMap<String, Set<String>> temp = new HashMap<String, Set<String>>();
+
+      double remainingInsertCosts = 0.0;
+
+      // Accumulating remaining insert costs if the query is longer than
+      // the word in the trie.
+      for (int i = index; i < query.length(); i++) {
+        remainingInsertCosts += edm.getInsertCosts(query.charAt(i));
+      }
+
+      // The rest of the query has to be affordable with the remaining distance.
+      if (remainingInsertCosts <= distance) {
+        // if (remainingInsertCosts <= distance &&
+        // !node.getTypes().isEmpty()) {
+        // if (query.length() - index <= distance) {
+
+        if (fragment) {
+          // Fragment mode reports the types of everything below this node as well.
+          temp.put(result, new HashSet<String>(getTypeCone(node)));
+        } else {
+          temp.put(result, new HashSet<String>(node.getTypes()));
+        }
+
+        resultMap.putAll(temp);
+      }
+
+      // Important: word end does not mean no children any more!
+      // NOTE(review): the null check is only made in this branch; the loop below dereferences
+      // getChildren() for every node - verify whether getChildren() can return null.
+      if (node.getChildren() == null) {
+        return resultMap;
+      }
+    }
+
+    // Recursion over all children.
+    for (MultiTextNode tempNode : node.getChildren().values()) {
+
+      // Match: the child's character equals the current query character; no costs.
+      if (index < query.length()) {
+        if (ignoreCase) {
+          if (Character.toLowerCase(tempNode.getValue()) == Character.toLowerCase(query
+                  .charAt(index))) {
+            resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(),
+                    distance, index + 1, ignoreCase, fragment, edm, false, false));
+          }
+        } else {
+          if (tempNode.getValue() == query.charAt(index)) {
+            resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(),
+                    distance, index + 1, ignoreCase, fragment, edm, false, false));
+          }
+        }
+      }
+
+      // NOTE(review): the replace costs are looked up for the characters of the parent and the
+      // child node rather than for the query character, and this step is also taken when the
+      // characters matched above or the query is already exhausted - confirm the intent.
+      if (distance - edm.getReplaceCosts(node.getValue(), tempNode.getValue()) >= 0) {
+
+        // Substitute.
+        resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(), distance
+                - edm.getReplaceCosts(node.getValue(), tempNode.getValue()), index + 1, ignoreCase,
+                fragment, edm, false, false));
+      }
+
+      if (!lastActionDelete) {
+        if (distance - edm.getInsertCosts(tempNode.getValue()) >= 0) {
+          // Insert - use the same index twice.
+          resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(),
+                  distance - edm.getInsertCosts(tempNode.getValue()), index, ignoreCase, fragment,
+                  edm, true, false));
+        }
+      }
+    }
+
+    return resultMap;
+  }
+
+  /**
+   * Checks recursively whether the trie below the given node contains a word that can be
+   * matched against the query string without exceeding the remaining edit distance.
+   * 
+   * @param node
+   *          The MultiTextNode which is under consideration at the moment.
+   * @param query
+   *          The query string.
+   * @param result
+   *          The characters of the trie path followed so far. Within this method it is only
+   *          checked for being non-empty before a delete step is tried.
+   * @param distance
+   *          The remaining edit distance.
+   * @param index
+   *          The index of the query string at the moment.
+   * @param ignoreCase
+   *          If true, trie and query characters are compared after Character.toLowerCase.
+   * @param fragment
+   *          If true, the search succeeds as soon as the index equals the query length, even if
+   *          the current node is not a word end.
+   * @param edm
+   *          The edit distance cost map we are using.
+   * @return true as soon as one path within the remaining edit distance has been found, false
+   *         if no such path exists below the given node.
+   */
+  private boolean editDistanceBool(MultiTextNode node, String query, String result,
+          double distance, int index, boolean ignoreCase, boolean fragment, EditDistanceCostMap edm) {
+
+    boolean deletion = false;
+    boolean insertion = false;
+    boolean substitution = false;
+    boolean noop = false;
+
+    // Recursion stop.
+    if (fragment) {
+      if (index == query.length()) {
+        return true;
+      }
+    }
+
+    if (node.isWordEnd()) {
+
+      double remainingInsertCosts = 0.0;
+
+      // Accumulating remaining insert costs if the query is longer than
+      // the word in the trie.
+      for (int i = index; i < query.length(); i++) {
+        remainingInsertCosts += edm.getInsertCosts(query.charAt(i));
+      }
+
+      if (remainingInsertCosts <= distance) {
+        // if (query.length() - index <= distance) {
+        return true;
+      }
+    }
+
+    // Delete: stay on the same node and advance the query index by one. It is only tried once
+    // at least one trie character has been consumed (result is not empty).
+    if (distance - edm.getDeleteCosts(node.getValue()) >= 0 && result.length() > 0) {
+      deletion = editDistanceBool(node, query, result, distance
+              - edm.getDeleteCosts(node.getValue()), index + 1, ignoreCase, fragment, edm);
+
+      if (deletion) {
+        return true;
+      }
+    }
+
+    // Recursion.
+    for (MultiTextNode tempNode : node.getChildren().values()) {
+
+      // Exact match of the child's character with the current query character: descend without
+      // spending any edit distance.
+      if (index < query.length()) {
+        if (ignoreCase) {
+          if (Character.toLowerCase(tempNode.getValue()) == Character.toLowerCase(query
+                  .charAt(index))) {
+            noop = editDistanceBool(tempNode, query, result + tempNode.getValue(), distance,
+                    index + 1, ignoreCase, fragment, edm);
+          }
+        } else {
+          if (tempNode.getValue() == query.charAt(index)) {
+            noop = editDistanceBool(tempNode, query, result + tempNode.getValue(), distance,
+                    index + 1, ignoreCase, fragment, edm);
+          }
+        }
+
+        if (noop) {
+          return true;
+        }
+      }
+
+      // NOTE(review): the replace costs are looked up for the current node's character and the
+      // child's character, not for the query character - confirm that this is intended.
+      if (distance - edm.getReplaceCosts(node.getValue(), tempNode.getValue()) >= 0) {
+
+        // Substitute.
+        substitution = editDistanceBool(tempNode, query, result + tempNode.getValue(), distance
+                - edm.getReplaceCosts(node.getValue(), tempNode.getValue()), index + 1, ignoreCase,
+                fragment, edm);
+
+        if (substitution) {
+          return true;
+        }
+      }
+
+      if (distance - edm.getInsertCosts(tempNode.getValue()) >= 0) {
+        // Insert - use the same index twice.
+        insertion = editDistanceBool(tempNode, query, result + tempNode.getValue(), distance
+                - edm.getInsertCosts(tempNode.getValue()), index, ignoreCase, fragment, edm);
+
+        if (insertion) {
+          return true;
+        }
+      }
+
+    }
+
+    // No child path succeeded within the remaining distance.
+    return false;
+  }
+
+  // private Map<String, Set<String>> editDistance(MultiTextNode node, String query, String result,
+  // double distance, int index, boolean ignoreCase, String ignoreToken, boolean fragment,
+  // EditDistanceCostMap edm) {
+  //
+  // // TODO: implement fragment support if it is needed.
+  //
+  // EditDistanceResultMap resultMap = new EditDistanceResultMap();
+  //
+  // // Delete.
+  // if (distance - edm.getDeleteCosts(node.getValue()) >= 0 && result.length() > 0) {
+  // resultMap.putAll(editDistance(node, query, result, distance
+  // - edm.getDeleteCosts(node.getValue()), index + 1, ignoreCase, ignoreToken, fragment,
+  // edm));
+  // }
+  //
+  // // Recursion stop.
+  // if (node.isWordEnd()) {
+  //
+  // HashMap<String, Set<String>> temp = new HashMap<String, Set<String>>();
+  //
+  // if (query.length() - index <= distance) {
+  // temp.put(result, new HashSet<String>(node.getTypes()));
+  // resultMap.putAll(temp);
+  // }
+  //
+  // // Ignore token at the end of the word.
+  // if (ignoreToken.contains(String.valueOf(node.getValue()))) {
+  // temp.put(result, new HashSet<String>(node.getTypes()));
+  // resultMap.putAll(temp);
+  // }
+  //
+  // if (node.getChildren() == null) {
+  // return resultMap;
+  // }
+  // }
+  //
+  // // Recursion.
+  // for (MultiTextNode tempNode : node.getChildren().values()) {
+  //
+  // if (index < query.length()) {
+  // if (ignoreCase) {
+  // if (Character.toLowerCase(tempNode.getValue()) == Character.toLowerCase(query
+  // .charAt(index))) {
+  // resultMap.putAll(editDistance(tempNode, query, result + tempNode.getValue(), distance,
+  // index + 1, ignoreCase, ignoreToken, fragment, edm));
+  // }
+  // } else {
+  // if (tempNode.getValue() == query.charAt(index)) {
+  // resultMap.putAll(editDistance(tempNode, query, result + tempNode.getValue(), distance,
+  // index + 1, ignoreCase, ignoreToken, fragment, edm));
+  // }
+  // }
+  // }
+  //
+  // if (distance - edm.getReplaceCosts(node.getValue(), tempNode.getValue()) >= 0) {
+  //
+  // // Substitute.
+  // resultMap.putAll(editDistance(tempNode, query, result + tempNode.getValue(), distance
+  // - edm.getReplaceCosts(node.getValue(), tempNode.getValue()), index + 1, ignoreCase,
+  // ignoreToken, fragment, edm));
+  // }
+  //
+  // // Ignore token.
+  // if (ignoreToken.contains(String.valueOf(tempNode.getValue()))) {
+  // resultMap.putAll(editDistance(tempNode, query, result + tempNode.getValue(), distance,
+  // index, ignoreCase, ignoreToken, fragment, edm));
+  // } else {
+  // if (distance - edm.getInsertCosts(tempNode.getValue()) >= 0) {
+  // // Insert - use the same index twice.
+  // resultMap.putAll(editDistance(tempNode, query, result + tempNode.getValue(), distance
+  // - edm.getInsertCosts(tempNode.getValue()), index, ignoreCase, ignoreToken,
+  // fragment, edm));
+  // }
+  // }
+  // }
+  //
+  // return resultMap;
+  // }
+
+  /**
+   * Derives the hash code from the cost map and the root node, the two fields that
+   * {@link #equals(Object)} compares. A field that is null contributes 0.
+   */
+  @Override
+  public int hashCode() {
+    int costMapHash = (costMap != null) ? costMap.hashCode() : 0;
+    int rootHash = (root != null) ? root.hashCode() : 0;
+    // Same 31-based polynomial as 31 * (31 * 1 + costMapHash) + rootHash.
+    return 31 * (31 + costMapHash) + rootHash;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    MultiTreeWordList other = (MultiTreeWordList) obj;
+    if (costMap == null) {
+      if (other.costMap != null)
+        return false;
+    } else if (!costMap.equals(other.costMap))
+      return false;
+    if (root == null) {
+      if (other.root != null)
+        return false;
+    } else if (!root.equals(other.root))
+      return false;
+    return true;
+  }
+
+  /**
+   * Hands the root node of this word list and the given path to
+   * {@code persistence.createMTWLFile}.
+   * 
+   * NOTE(review): presumably this writes the trie as an XML file to the path; the declaration
+   * of {@code persistence} is outside of this method - confirm there.
+   * 
+   * @param path
+   *          The location passed on to the persistence helper.
+   */
+  public void createMTWLFile(String path) {
+    persistence.createMTWLFile(root, path);
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordList.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordList.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordListPersistence.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordListPersistence.java?rev=1152824&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordListPersistence.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordListPersistence.java Mon Aug  1 15:24:44 2011
@@ -0,0 +1,95 @@
+package org.apache.uima.tm.textmarker.resource.trie;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+/**
+ * Reads and writes the XML representation of a trie of MultiTextNodes. Each node is stored as
+ * an {@code <n>} element holding its character in {@code <c>}, its types in {@code <t>}
+ * elements and its children as nested {@code <n>} elements.
+ */
+public class MultiTreeWordListPersistence {
+
+  /**
+   * Reads the UTF-8 encoded XML file with the specified path and attaches the nodes it
+   * describes to the given root node.
+   * 
+   * @param root
+   *          The node which receives the parsed nodes.
+   * @param path
+   *          The location of the XML-File.
+   */
+  public void readMTWL(MultiTextNode root, String path) {
+    readMTWL(root, path, "UTF-8");
+  }
+
+  /**
+   * Reads the XML file with the specified path and attaches the nodes it describes to the given
+   * root node. Failures are reported via printStackTrace; the root node may then be filled only
+   * partially. The file is closed in every case.
+   * 
+   * @param root
+   *          The node which receives the parsed nodes.
+   * @param path
+   *          The location of the XML-File.
+   * @param encoding
+   *          The name of the character encoding of the file.
+   */
+  public void readMTWL(MultiTextNode root, String path, String encoding) {
+    FileInputStream input = null;
+    try {
+      input = new FileInputStream(path);
+      InputStreamReader stream = new InputStreamReader(input, encoding);
+      TrieXMLEventHandler handler = new TrieXMLEventHandler(root);
+      SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
+      SAXParser saxParser = saxParserFactory.newSAXParser();
+      XMLReader reader = saxParser.getXMLReader();
+      reader.setContentHandler(handler);
+      reader.setErrorHandler(handler);
+      reader.parse(new InputSource(stream));
+    } catch (IOException e) {
+      e.printStackTrace();
+    } catch (SAXException e) {
+      e.printStackTrace();
+    } catch (ParserConfigurationException e) {
+      e.printStackTrace();
+    } finally {
+      // Closing the underlying stream is sufficient, the reader holds no other resource.
+      close(input);
+    }
+  }
+
+  /**
+   * Writes the children of the given root node as a UTF-8 encoded XML file.
+   * 
+   * @param root
+   *          The node whose children are written.
+   * @param path
+   *          The location of the file to create.
+   */
+  public void createMTWLFile(MultiTextNode root, String path) {
+    createMTWLFile(root, path, "UTF-8");
+  }
+
+  /**
+   * Writes the children of the given root node as an XML file. Writing stops at the first I/O
+   * failure, which is reported via printStackTrace. The file is closed in every case.
+   * 
+   * @param root
+   *          The node whose children are written.
+   * @param path
+   *          The location of the file to create.
+   * @param encoding
+   *          The name of the character encoding of the file.
+   */
+  public void createMTWLFile(MultiTextNode root, String path, String encoding) {
+    FileOutputStream output = null;
+    try {
+      output = new FileOutputStream(path);
+      Writer writer = new OutputStreamWriter(output, encoding);
+      writer.write("<?xml version=\"1.0\" ?><root>");
+      for (MultiTextNode node : root.getChildren().values()) {
+        writeTextNode(writer, node);
+      }
+      writer.write("</root>");
+      // Regular close: flushes the encoder, so a failing write is reported by the catch below.
+      writer.close();
+    } catch (IOException e) {
+      e.printStackTrace();
+    } finally {
+      // Releases the file if anything above failed; closing an already closed stream is a no-op.
+      close(output);
+    }
+  }
+
+  /**
+   * Writes the given node and, recursively, all of its children.
+   * 
+   * NOTE(review): type names are written verbatim; a name containing '<' or '&' would lead to
+   * malformed XML.
+   */
+  private void writeTextNode(Writer writer, MultiTextNode node) throws IOException {
+    writer.write("\n");
+    writer.write("<n>");
+    writer.write("<c><![CDATA[" + node.getValue() + "]]></c>");
+    if (node.isWordEnd()) {
+      for (String type : node.getTypes()) {
+        writer.write("<t>" + type + "</t>");
+      }
+    }
+    for (MultiTextNode child : node.getChildren().values()) {
+      writeTextNode(writer, child);
+    }
+    writer.write("</n>");
+  }
+
+  /** Closes the given resource if it was opened and reports a failing close. */
+  private static void close(java.io.Closeable resource) {
+    if (resource == null) {
+      return;
+    }
+    try {
+      resource.close();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+  }
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordListPersistence.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/MultiTreeWordListPersistence.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/TrieXMLEventHandler.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/TrieXMLEventHandler.java?rev=1152824&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/TrieXMLEventHandler.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/TrieXMLEventHandler.java Mon Aug  1 15:24:44 2011
@@ -0,0 +1,94 @@
+package org.apache.uima.tm.textmarker.resource.trie;
+
+import java.util.Stack;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler which rebuilds a trie of MultiTextNodes from its XML representation: every
+ * {@code <n>} element becomes a node, {@code <c>} holds the character of the node and each
+ * {@code <t>} element holds one of its types. A finished node is added to its parent when its
+ * {@code <n>} element ends.
+ */
+public class TrieXMLEventHandler extends DefaultHandler {
+
+  /** The nodes of the currently open {@code <n>} elements, the root node at the bottom. */
+  private Stack<MultiTextNode> stack;
+
+  /** Text of the {@code <t>} element read at the moment; a parser may deliver it in pieces. */
+  private final StringBuilder typeText = new StringBuilder();
+
+  boolean inContent;
+
+  boolean inType;
+
+  /**
+   * @param root
+   *          The node which receives the top-level nodes of the document as children.
+   */
+  public TrieXMLEventHandler(MultiTextNode root) {
+    super();
+    this.stack = new Stack<MultiTextNode>();
+    stack.push(root);
+  }
+
+  @Override
+  public void startDocument() {
+  }
+
+  @Override
+  public void endDocument() {
+  }
+
+  @Override
+  public void startElement(String namespaceURI, String localName, String qualifiedName,
+          Attributes atts) {
+    if (isElement("n", localName, qualifiedName)) {
+      stack.push(new MultiTextNode());
+      inContent = false;
+      inType = false;
+    }
+    if (isElement("t", localName, qualifiedName)) {
+      inType = true;
+      inContent = false;
+      typeText.setLength(0);
+    }
+    if (isElement("c", localName, qualifiedName)) {
+      inType = false;
+      inContent = true;
+    }
+  }
+
+  @Override
+  public void endElement(String namespaceURI, String localName, String qualifiedName) {
+    if (isElement("n", localName, qualifiedName)) {
+      MultiTextNode finished = stack.pop();
+      stack.peek().addChild(finished);
+    }
+    if (isElement("t", localName, qualifiedName)) {
+      // The complete text is only known here, so the type is added once per element.
+      if (typeText.length() > 0 && !stack.isEmpty()) {
+        stack.peek().addType(typeText.toString());
+      }
+      typeText.setLength(0);
+      inType = false;
+    }
+    if (isElement("c", localName, qualifiedName)) {
+      inContent = false;
+    }
+  }
+
+  @Override
+  public void characters(char ch[], int start, int length) {
+    if (stack.isEmpty()) {
+      return;
+    }
+    if (inType) {
+      typeText.append(ch, start, length);
+    } else if (inContent && length > 0) {
+      // The array is the buffer of the parser; the text of this event begins at start, not at 0.
+      // An event without characters leaves the value of the node untouched.
+      stack.peek().setValue(ch[start]);
+    }
+  }
+
+  /** Returns true if the element is called name, either by its local or its qualified name. */
+  private static boolean isElement(String name, String localName, String qualifiedName) {
+    return name.equals(localName) || name.equals(qualifiedName);
+  }
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/TrieXMLEventHandler.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/resource/trie/TrieXMLEventHandler.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy (2) of SeedLexer.flex
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy%20%282%29%20of%20SeedLexer.flex?rev=1152824&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy (2) of SeedLexer.flex (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy (2) of SeedLexer.flex Mon Aug  1 15:24:44 2011
@@ -0,0 +1,215 @@
+package org.apache.uima.tm.textmarker.scanner;
+import java.util.*;
+import java.util.regex.*;
+
+import org.apache.uima.jcas.JCas;
+
+import org.apache.uima.tm.textmarker.kernel.type.TextMarkerBasic;
+import org.apache.uima.tm.type.AMP;
+import org.apache.uima.tm.type.BREAK;
+import org.apache.uima.tm.type.CAP;
+import org.apache.uima.tm.type.COLON;
+import org.apache.uima.tm.type.COMMA;
+import org.apache.uima.tm.type.CW;
+import org.apache.uima.tm.type.EXCLAMATION;
+import org.apache.uima.tm.type.MARKUP;
+import org.apache.uima.tm.type.NBSP;
+import org.apache.uima.tm.type.NUM;
+import org.apache.uima.tm.type.PERIOD;
+import org.apache.uima.tm.type.QUESTION;
+import org.apache.uima.tm.type.SEMICOLON;
+import org.apache.uima.tm.type.SPACE;
+import org.apache.uima.tm.type.SPECIAL;
+import org.apache.uima.tm.type.SW;
+
+%%
+
+%{
+    private int number = 0;
+
+    /** Lower-cased tag name mapped to the text which followed the name inside the tag. */
+    private Map<String,String> tags = new HashMap<String,String>();
+    private JCas cas;
+    private final static Pattern tagPattern =
+        Pattern.compile("</?(\\w+)([^>]*)>");
+
+    /**
+     * Stores the given tag in the tags map: the lower-cased tag name is the key, the remaining
+     * text up to the closing bracket is the value.
+     *
+     * @return the lower-cased tag name, or null if the text does not look like a tag.
+     */
+    private String splitAndPutInMap(String tag){
+        Matcher m = tagPattern.matcher(tag);
+        if(!m.find()){
+            return null;
+        }
+        // Fixed locale: with the default locale "I" becomes a dotless i on Turkish systems, so
+        // that <I> and </i> would no longer refer to the same key.
+        String name = m.group(1).toLowerCase(Locale.ENGLISH);
+        tags.put(name,m.group(2));
+        return name;
+    }
+
+    /**
+     * Removes the entry of the given closing tag from the tags map. The key is the tag text
+     * without its brackets, lower-cased in the same way as in splitAndPutInMap and trimmed.
+     */
+    private void removeTag(String closingTag){
+        String cTag = closingTag.replace("</","").replace(">","");
+        tags.remove(cTag.toLowerCase(Locale.ENGLISH).trim());
+    }
+
+    /** Sets the JCas which the rules use to create their annotations. */
+    public void setJCas(JCas cas) {
+        this.cas = cas;
+    }
+%}
+
+%unicode
+%line
+%char
+%type TextMarkerBasic
+%class SeedLexer
+
+ALPHA=[A-Za-z]
+DIGIT=[0-9]
+WHITE_SPACE_CHAR=[\n\r\ \t\b\012]
+BREAK=[\n\r\b\012]
+SPACE=[ \t]
+
+%%
+
+
+<YYINITIAL> {
+    
+    /*
+     * Every rule of this state creates one annotation of the type named in its action. The
+     * annotation spans the matched text (from yychar to yychar + yytext().length()), gets the
+     * shared tags map attached and is returned to the caller.
+     * JFlex takes the longest match and, for matches of equal length, the rule listed first,
+     * so the order of the rules is significant.
+     */
+
+    /* Closing tag: its entry is removed from the tags map before the MARKUP is created. */
+    \<[/]\w+[^>]*\> {
+                removeTag(yytext());
+                MARKUP t = new MARKUP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+                    
+    /* Opening tag: its name and remaining text are stored in the tags map. */
+    \<\w+[^>]*\> {
+                String tag = splitAndPutInMap(yytext());
+                MARKUP t = new MARKUP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+                
+    /* Non-breaking space; listed before the entity rule, which matches &nbsp; as well. */
+    \xA0|&nbsp;|&NBSP; {
+                NBSP t = new NBSP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Any other entity made of letters, such as &amp; */
+    &{ALPHA}+; {
+                AMP t = new AMP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    {BREAK} {
+                BREAK t = new BREAK(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    {SPACE} {
+                SPACE t = new SPACE(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    ":" {
+                COLON t = new COLON(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    "," {
+                COMMA t = new COMMA(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    "." {
+                PERIOD t = new PERIOD(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    
+    "!" {
+                EXCLAMATION t = new EXCLAMATION(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;    
+    }
+
+    ";" {
+                SEMICOLON t = new SEMICOLON(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    "?" {
+                QUESTION t = new QUESTION(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Word written in lower case only. */
+    [:lowercase:]+ {
+                SW t = new SW(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /*
+     * Upper-case letter followed by lower-case letters. A single upper-case letter matches the
+     * next rule with the same length, too; being listed first, this rule presumably wins.
+     */
+    [:uppercase:][:lowercase:]* {
+                CW t = new CW(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Word written in upper case only. */
+    [:uppercase:]+ {
+                CAP t = new CAP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    {DIGIT}+ {
+                NUM t = new NUM(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Fallback for every single character which no rule above matches. */
+    . {
+                SPECIAL t = new SPECIAL(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* The end of the input is reported to the caller as null. */
+    <<EOF>> {
+                return null;
+    }
+
+}
+

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy (2) of SeedLexer.flex
------------------------------------------------------------------------------
    svn:executable = *

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy of SeedLexer.flex
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy%20of%20SeedLexer.flex?rev=1152824&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy of SeedLexer.flex (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy of SeedLexer.flex Mon Aug  1 15:24:44 2011
@@ -0,0 +1,236 @@
+package org.apache.uima.tm.textmarker.scanner;
+import java.util.*;
+import java.util.regex.*;
+
+import org.apache.uima.jcas.JCas;
+
+import org.apache.uima.tm.textmarker.kernel.type.TextMarkerBasic;
+import org.apache.uima.tm.type.AMP;
+import org.apache.uima.tm.type.BREAK;
+import org.apache.uima.tm.type.CAP;
+import org.apache.uima.tm.type.COLON;
+import org.apache.uima.tm.type.COMMA;
+import org.apache.uima.tm.type.CW;
+import org.apache.uima.tm.type.EXCLAMATION;
+import org.apache.uima.tm.type.MARKUP;
+import org.apache.uima.tm.type.NBSP;
+import org.apache.uima.tm.type.NUM;
+import org.apache.uima.tm.type.PERIOD;
+import org.apache.uima.tm.type.QUESTION;
+import org.apache.uima.tm.type.SEMICOLON;
+import org.apache.uima.tm.type.SPACE;
+import org.apache.uima.tm.type.SPECIAL;
+import org.apache.uima.tm.type.SW;
+
+%%
+
+%{
+    private int number = 0;
+    private int comment_count = 0;
+    /** Lower-cased tag name mapped to the text which followed the name inside the tag. */
+    private Map<String,String> tags = new HashMap<String,String>();
+    private JCas cas;
+    private final static Pattern tagPattern =
+        Pattern.compile("</?(\\w+)([^>]*)>");
+
+    /**
+     * Stores the given tag in the tags map: the lower-cased tag name is the key, the remaining
+     * text up to the closing bracket is the value.
+     *
+     * @return the lower-cased tag name, or null if the text does not look like a tag.
+     */
+    private String splitAndPutInMap(String tag){
+        Matcher m = tagPattern.matcher(tag);
+        if(!m.find()){
+            return null;
+        }
+        // Fixed locale: with the default locale "I" becomes a dotless i on Turkish systems, so
+        // that <I> and </i> would no longer refer to the same key.
+        String name = m.group(1).toLowerCase(Locale.ENGLISH);
+        tags.put(name,m.group(2));
+        return name;
+    }
+
+    /**
+     * Removes the entry of the given closing tag from the tags map. The key is the tag text
+     * without its brackets, lower-cased in the same way as in splitAndPutInMap and trimmed.
+     */
+    private void removeTag(String closingTag){
+        String cTag = closingTag.replace("</","").replace(">","");
+        tags.remove(cTag.toLowerCase(Locale.ENGLISH).trim());
+    }
+
+    /** Sets the JCas which the rules use to create their annotations. */
+    public void setJCas(JCas cas) {
+        this.cas = cas;
+    }
+%}
+
+%unicode
+%line
+%char
+%state COMMENT
+%type TextMarkerBasic
+%class SeedLexer
+
+ALPHA=[A-Za-z]
+DIGIT=[0-9]
+WHITE_SPACE_CHAR=[\n\r\ \t\b\012]
+BREAK=[\n\r\b\012]
+SPACE=[ \t]
+COMMENT_TEXT=([^*/]|[^*]"/"[^*]|[^/]"*"[^/]|"*"[^/]|"/"[^*])*
+%%
+
+
+<YYINITIAL> {
+    /*
+     * Apart from the comment opener, every rule of this state creates one annotation of the
+     * type named in its action. The annotation spans the matched text (from yychar to
+     * yychar + yytext().length()), gets the shared tags map attached and is returned.
+     * JFlex takes the longest match and, for matches of equal length, the rule listed first,
+     * so the order of the rules is significant.
+     */
+
+    /* Start of a comment: no token is returned, scanning continues in the COMMENT state. */
+    "/*" {
+                yybegin(COMMENT);
+                comment_count++;
+    }
+    
+    /* Closing tag: its entry is removed from the tags map before the MARKUP is created. */
+    \<[/][^>]+\> {
+                removeTag(yytext());
+                MARKUP t = new MARKUP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+                    
+    /* Opening tag: its name and remaining text are stored in the tags map. */
+    \<[^>]+\> {
+                splitAndPutInMap(yytext());
+                MARKUP t = new MARKUP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+                
+    /* Non-breaking space; listed before the entity rule, which matches &nbsp; as well. */
+    \xA0|&nbsp;|&NBSP; {
+                NBSP t = new NBSP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Any other entity made of letters, such as &amp; */
+    &{ALPHA}+; {
+                AMP t = new AMP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    {BREAK} {
+                BREAK t = new BREAK(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    {SPACE} {
+                SPACE t = new SPACE(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    ":" {
+                COLON t = new COLON(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    "," {
+                COMMA t = new COMMA(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    "." {
+                PERIOD t = new PERIOD(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    "!" {
+                EXCLAMATION t = new EXCLAMATION(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    ";" {
+                SEMICOLON t = new SEMICOLON(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    "?" {
+                QUESTION t = new QUESTION(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Word written in lower case only. */
+    [:lowercase:]+ {
+                SW t = new SW(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Upper-case letter followed by lower-case letters. */
+    [:uppercase:][:lowercase:]* {
+                CW t = new CW(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Word written in upper case only. */
+    [:uppercase:]+ {
+                CAP t = new CAP(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    {DIGIT}+ {
+                NUM t = new NUM(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /* Fallback for every single character which no rule above matches. */
+    . {
+                SPECIAL t = new SPECIAL(cas);
+                t.setBegin(yychar);
+                t.setEnd(yychar + yytext().length());
+                t.setTags(tags);
+                return t;
+    }
+
+    /*
+     * The end of the input is reported as null. Returning a token here would be repeated on
+     * every further call, so a caller which reads until it gets null would never stop.
+     */
+    <<EOF>> {
+                return null;
+    }
+
+}
+
+<COMMENT> {
+    /* Comments nest: the counter tracks the depth, the state is left at depth 0. */
+    "/*" { comment_count++; }
+    "*/" { if (--comment_count == 0) yybegin(YYINITIAL); }
+    {COMMENT_TEXT} { }
+    /*
+     * Input which ends inside a comment: the end is reported as null. Returning a token here
+     * would be repeated on every further call, so a caller which reads until it gets null
+     * would never stop.
+     */
+    <<EOF>> {
+                return null;
+    }
+}
\ No newline at end of file

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/Copy of SeedLexer.flex
------------------------------------------------------------------------------
    svn:executable = *

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/DefaultSeeder.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/DefaultSeeder.java?rev=1152824&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/DefaultSeeder.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/DefaultSeeder.java Mon Aug  1 15:24:44 2011
@@ -0,0 +1,33 @@
+package org.apache.uima.tm.textmarker.seed;
+
+import java.io.BufferedReader;
+import java.io.StringReader;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.jcas.tcas.Annotation;
+
+/**
+ * Seeds a CAS with the annotations which the SeedLexer produces for a text.
+ */
+public class DefaultSeeder implements TextMarkerAnnotationSeeder {
+
+  /**
+   * Runs the SeedLexer over the given text and adds every annotation it returns to the indexes
+   * of the CAS. Seeding stops at the end of the text or at the first failure of the lexer;
+   * failures are reported via printStackTrace and not passed on to the caller.
+   * 
+   * @param text
+   *          The text to tokenize.
+   * @param cas
+   *          The CAS whose JCas is handed to the lexer.
+   */
+  public void seed(String text, CAS cas) {
+    BufferedReader reader = new BufferedReader(new StringReader(text));
+    final SeedLexer sourceLexer = new SeedLexer(reader);
+    try {
+      sourceLexer.setJCas(cas.getJCas());
+    } catch (CASException e) {
+      // The lexer was not given a JCas, so it is not asked for annotations at all.
+      e.printStackTrace();
+      return;
+    }
+
+    Annotation a = nextAnnotation(sourceLexer);
+    while (a != null) {
+      a.addToIndexes();
+      a = nextAnnotation(sourceLexer);
+    }
+  }
+
+  /**
+   * Returns the next annotation of the lexer, or null at the end of the text or if the lexer
+   * fails. Returning null on failure keeps the caller from indexing the previous annotation a
+   * second time.
+   */
+  private Annotation nextAnnotation(SeedLexer lexer) {
+    try {
+      return lexer.yylex();
+    } catch (Exception e) {
+      e.printStackTrace();
+      return null;
+    }
+  }
+}
\ No newline at end of file

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/DefaultSeeder.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/DefaultSeeder.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/SeedLexer.flex
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/SeedLexer.flex?rev=1152824&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/SeedLexer.flex (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/SeedLexer.flex Mon Aug  1 15:24:44 2011
@@ -0,0 +1,233 @@
+package org.apache.uima.tm.textmarker.scanner;
+import java.util.*;
+import java.util.regex.*;
+
+import org.apache.uima.jcas.JCas;
+
+import org.apache.uima.tm.textmarker.kernel.type.TextMarkerBasic;
+import org.apache.uima.tm.type.AMP;
+import org.apache.uima.tm.type.BREAK;
+import org.apache.uima.tm.type.CAP;
+import org.apache.uima.tm.type.COLON;
+import org.apache.uima.tm.type.COMMA;
+import org.apache.uima.tm.type.CW;
+import org.apache.uima.tm.type.EXCLAMATION;
+import org.apache.uima.tm.type.MARKUP;
+import org.apache.uima.tm.type.NBSP;
+import org.apache.uima.tm.type.NUM;
+import org.apache.uima.tm.type.PERIOD;
+import org.apache.uima.tm.type.QUESTION;
+import org.apache.uima.tm.type.SEMICOLON;
+import org.apache.uima.tm.type.SPACE;
+import org.apache.uima.tm.type.SPECIAL;
+import org.apache.uima.tm.type.SW;
+
+%%
+
+%{
+    // Not referenced by any rule or helper in this specification.
+    private int number = 0;
+
+    /*
+     * Markup elements seen as opened and not yet closed: lower-cased element name mapped to
+     * the raw text that followed the name inside the opening tag. This one map instance is
+     * passed to setTags of every token the rules create.
+     */
+    private Map<String,String> tags = new HashMap<String,String>();
+    // Constructor argument for every token annotation created by the rules; see setJCas.
+    private JCas cas;
+    // Group 1: element name (word characters only); group 2: everything up to the closing '>'.
+    private final static Pattern tagPattern =
+        Pattern.compile("</?(\\w+)([^>]*)>");
+
+    /**
+     * Registers an opening tag in the tag map.
+     *
+     * @param tag the matched markup text, e.g. "<a href=x>"
+     * @return the lower-cased element name, or "!" when the text contains no tag
+     *         that tagPattern recognises (nothing is stored in that case)
+     */
+    private String splitAndPutInMap(String tag){
+        Matcher m = tagPattern.matcher(tag);
+        if(m.find()){
+            // Fixed locale: the default-locale variant turns "TITLE" into "t\u0131tle"
+            // under a Turkish locale, so the key would depend on the machine's settings.
+            String name = m.group(1).toLowerCase(Locale.ENGLISH);
+            tags.put(name,m.group(2));
+            return name;
+        } else {
+            return "!";
+        }
+    }
+
+    /**
+     * Removes the element named by a closing tag from the tag map.
+     * The name is extracted with tagPattern, i.e. exactly the way splitAndPutInMap
+     * builds the key, so "</xs:element>" or "</p >" remove what "<xs:element>" or
+     * "<p>" stored. Text that does not start with a recognisable tag (e.g. "</!...>")
+     * falls back to stripping the "</" and ">" delimiters.
+     *
+     * @param closingTag the matched closing markup text
+     */
+    private void removeTag(String closingTag){
+        Matcher m = tagPattern.matcher(closingTag);
+        String name;
+        if(m.lookingAt()){
+            name = m.group(1);
+        } else {
+            name = closingTag.replace("</","").replace(">","").trim();
+        }
+        tags.remove(name.toLowerCase(Locale.ENGLISH));
+    }
+
+    /**
+     * Sets the JCas in which the rules create their token annotations.
+     *
+     * @param cas the JCas to create annotations in
+     */
+    public void setJCas(JCas cas) {
+        this.cas = cas;
+    }
+%}
+
+// Scanner options: Unicode input, line and character counting (the rules below use
+// yychar as the begin offset of each token; yyline is not used by them).
+%unicode
+%line
+%char
+// yylex() returns TextMarkerBasic; the generated scanner class is named SeedLexer.
+%type TextMarkerBasic
+%class SeedLexer
+
+// ASCII letters; used by the character-entity rule (&name;).
+ALPHA=[A-Za-z]
+// ASCII digits; used by the NUM rule.
+DIGIT=[0-9]
+// Not referenced by any rule in this specification. \012 is the octal escape of \n.
+WHITE_SPACE_CHAR=[\n\r\ \t\b\012]
+// Newline, carriage return and backspace (\b); each single character becomes one BREAK token.
+BREAK=[\n\r\b\012]
+// Blank and tab; each single character becomes one SPACE token.
+SPACE=[ \t]
+
+%%
+
+
+<YYINITIAL> {
+
+    \<[/][!][^>]*> {
+                // closing form of a "<!...>" construct
+                removeTag(yytext());
+                MARKUP markup = new MARKUP(cas);
+                markup.setBegin(yychar);
+                markup.setEnd(yychar + yylength());
+                markup.setTags(tags);
+                return markup;
+    }
+
+    \<[!][^>]*> {
+                // "<!...>" construct such as a declaration or comment start
+                splitAndPutInMap(yytext());
+                MARKUP markup = new MARKUP(cas);
+                markup.setBegin(yychar);
+                markup.setEnd(yychar + yylength());
+                markup.setTags(tags);
+                return markup;
+    }
+
+    \<[/][A-Za-z][A-Za-z0-9]*[^>]*> {
+                // closing tag: drop the element from the tag map first
+                removeTag(yytext());
+                MARKUP markup = new MARKUP(cas);
+                markup.setBegin(yychar);
+                markup.setEnd(yychar + yylength());
+                markup.setTags(tags);
+                return markup;
+    }
+
+    \<[A-Za-z][A-Za-z0-9]*[^>]*> {
+                // opening tag: record the element in the tag map first
+                splitAndPutInMap(yytext());
+                MARKUP markup = new MARKUP(cas);
+                markup.setBegin(yychar);
+                markup.setEnd(yychar + yylength());
+                markup.setTags(tags);
+                return markup;
+    }
+
+    \xA0|&nbsp;|&NBSP; {
+                // non-breaking space, as a character or as an entity
+                NBSP nbsp = new NBSP(cas);
+                nbsp.setBegin(yychar);
+                nbsp.setEnd(yychar + yylength());
+                nbsp.setTags(tags);
+                return nbsp;
+    }
+
+    &{ALPHA}+; {
+                // any other named character entity
+                AMP entity = new AMP(cas);
+                entity.setBegin(yychar);
+                entity.setEnd(yychar + yylength());
+                entity.setTags(tags);
+                return entity;
+    }
+
+    {BREAK} {
+                BREAK lineBreak = new BREAK(cas);
+                lineBreak.setBegin(yychar);
+                lineBreak.setEnd(yychar + yylength());
+                lineBreak.setTags(tags);
+                return lineBreak;
+    }
+
+    {SPACE} {
+                SPACE space = new SPACE(cas);
+                space.setBegin(yychar);
+                space.setEnd(yychar + yylength());
+                space.setTags(tags);
+                return space;
+    }
+
+    ":" {
+                COLON colon = new COLON(cas);
+                colon.setBegin(yychar);
+                colon.setEnd(yychar + yylength());
+                colon.setTags(tags);
+                return colon;
+    }
+
+    "," {
+                COMMA comma = new COMMA(cas);
+                comma.setBegin(yychar);
+                comma.setEnd(yychar + yylength());
+                comma.setTags(tags);
+                return comma;
+    }
+
+    "." {
+                PERIOD period = new PERIOD(cas);
+                period.setBegin(yychar);
+                period.setEnd(yychar + yylength());
+                period.setTags(tags);
+                return period;
+    }
+
+    "!" {
+                EXCLAMATION exclamation = new EXCLAMATION(cas);
+                exclamation.setBegin(yychar);
+                exclamation.setEnd(yychar + yylength());
+                exclamation.setTags(tags);
+                return exclamation;
+    }
+
+    ";" {
+                SEMICOLON semicolon = new SEMICOLON(cas);
+                semicolon.setBegin(yychar);
+                semicolon.setEnd(yychar + yylength());
+                semicolon.setTags(tags);
+                return semicolon;
+    }
+
+    "?" {
+                QUESTION question = new QUESTION(cas);
+                question.setBegin(yychar);
+                question.setEnd(yychar + yylength());
+                question.setTags(tags);
+                return question;
+    }
+
+    [:lowercase:]+ {
+                // word made of lower-case letters only
+                SW smallWord = new SW(cas);
+                smallWord.setBegin(yychar);
+                smallWord.setEnd(yychar + yylength());
+                smallWord.setTags(tags);
+                return smallWord;
+    }
+
+    [:uppercase:][:lowercase:]* {
+                // one upper-case letter followed by lower-case letters
+                CW capitalizedWord = new CW(cas);
+                capitalizedWord.setBegin(yychar);
+                capitalizedWord.setEnd(yychar + yylength());
+                capitalizedWord.setTags(tags);
+                return capitalizedWord;
+    }
+
+    [:uppercase:]+ {
+                // run of upper-case letters
+                CAP allCaps = new CAP(cas);
+                allCaps.setBegin(yychar);
+                allCaps.setEnd(yychar + yylength());
+                allCaps.setTags(tags);
+                return allCaps;
+    }
+
+    {DIGIT}+ {
+                NUM number = new NUM(cas);
+                number.setBegin(yychar);
+                number.setEnd(yychar + yylength());
+                number.setTags(tags);
+                return number;
+    }
+
+    . {
+                // catch-all for a character no earlier rule matched
+                SPECIAL special = new SPECIAL(cas);
+                special.setBegin(yychar);
+                special.setEnd(yychar + yylength());
+                special.setTags(tags);
+                return special;
+    }
+
+    <<EOF>> {
+                // end of input: the caller stops when it receives null
+                return null;
+    }
+
+}
+

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textmarker.engine/src/main/java/org/apache/uima/tm/textmarker/seed/SeedLexer.flex
------------------------------------------------------------------------------
    svn:executable = *