You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2011/08/01 16:21:35 UTC

svn commit: r1152792 [7/10] - in /uima/sandbox/trunk/TextMarker: org.apache.uima.tm.textruler.lp2/ org.apache.uima.tm.textruler.lp2/META-INF/ org.apache.uima.tm.textruler.lp2/bin/ org.apache.uima.tm.textruler.lp2/src/ org.apache.uima.tm.textruler.lp2/s...

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,468 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.tm.textmarker.engine.TextMarkerEngine;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+import org.apache.uima.tm.textruler.extension.TextRulerLearner;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.tm.textruler.tools.MemoryWatch;
+import org.apache.uima.util.FileUtils;
+import org.eclipse.core.runtime.IPath;
+import org.eclipse.core.runtime.Path;
+
+
+/**
+ * 
+ * This class provides basic and shared functionality for all the implemented ML algorithms. New
+ * algorithms can subclass this class and use the whole framework for faster development.
+ * 
+ */
+public abstract class TextRulerBasicLearner implements TextRulerLearner, CasCacheLoader {
+
+  protected TextRulerLearnerDelegate delegate;
+
+  protected AnalysisEngine ae;
+
+  protected TextRulerExampleDocumentSet exampleDocuments;
+
+  protected String inputDirectory;
+
+  protected String tempDirectory;
+
+  protected String preprocessorTMFile;
+
+  protected Set<String> filterSet;
+
+  protected Set<String> filterSetWithSlotNames;
+
+  protected String[] slotNames;
+
+  protected CasCache casCache;
+
+  protected CAS algTestCAS;
+
+  /**
+   * Stores the learner configuration and prepares the extended filter set and the CAS cache.
+   * 
+   * @param inputDir
+   *          directory that is later handed to the example document set
+   * @param prePropTMFile
+   *          TextMarker preprocessing script; the analysis engine descriptor is derived from it
+   * @param tmpDir
+   *          directory for temporary rule and result files
+   * @param slotNames
+   *          type names of the slots to learn
+   * @param filterSet
+   *          type names to filter; may be <code>null</code> (treated as "no filter types")
+   * @param delegate
+   *          receiver of status updates and abort requests; may be <code>null</code>
+   */
+  public TextRulerBasicLearner(String inputDir, String prePropTMFile, String tmpDir,
+          String[] slotNames, Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+    super();
+    this.preprocessorTMFile = prePropTMFile;
+    this.tempDirectory = tmpDir;
+    this.slotNames = slotNames;
+    this.inputDirectory = inputDir;
+    this.delegate = delegate;
+    this.filterSet = filterSet;
+    // getTMFilterCommandString() accepts a null filter set, so the constructor must not fail
+    // on it either.
+    if (filterSet == null) {
+      filterSetWithSlotNames = new HashSet<String>();
+    } else {
+      filterSetWithSlotNames = new HashSet<String>(filterSet);
+    }
+    // besides the slot types themselves, their left and right boundary types are added
+    for (String s : slotNames) {
+      filterSetWithSlotNames.add(s);
+      filterSetWithSlotNames.add(TextRulerTarget.getSingleSlotTypeName(
+              MLTargetType.SINGLE_LEFT_BOUNDARY, s));
+      filterSetWithSlotNames.add(TextRulerTarget.getSingleSlotTypeName(
+              MLTargetType.SINGLE_RIGHT_BOUNDARY, s));
+    }
+
+    // TODO make the cache size configurable; maybe share e.g. 100 places between all running
+    // algorithms?
+    this.casCache = new CasCache(100, this);
+  }
+
+  protected String tempDirectory() {
+    return TextRulerToolkit.addTrailingSlashToPath(tempDirectory);
+  }
+
+  /**
+   * Asks the delegate whether the learning process should be aborted.
+   * 
+   * @return the delegate's answer, or <code>false</code> if no delegate is set
+   */
+  protected boolean shouldAbort() {
+    return delegate != null && delegate.shouldAbort();
+  }
+
+  /**
+   * Returns the analysis engine of this learner, loading and configuring it on first use.
+   * 
+   * The engine is loaded from the descriptor derived from the preprocessing script and is
+   * reconfigured to run the temporary rules file as its main script.
+   * 
+   * @return the analysis engine, or <code>null</code> if reconfiguring the freshly loaded engine
+   *         failed (the field <code>ae</code> stays set in that case)
+   */
+  public AnalysisEngine getAnalysisEngine() {
+    if (ae != null) {
+      return ae;
+    }
+
+    Path preprocessorPath = new Path(preprocessorTMFile);
+    String descriptorFile = TextRulerToolkit.getEngineDescriptorFromTMSourceFile(preprocessorPath);
+    sendStatusUpdateToDelegate("loading AE...", TextRulerLearnerState.ML_INITIALIZING, false);
+    ae = TextRulerToolkit.loadAnalysisEngine(descriptorFile);
+
+    // set filters to NO filtering so that we can add it manually with
+    // the FILTERTYPE expression!
+    ae.setConfigParameterValue(TextMarkerEngine.DEFAULT_FILTERED_MARKUPS, new String[0]);
+
+    // main script = rules file name without extension, script path = its parent folder
+    IPath rulesPath = new Path(getTempRulesFileName());
+    String mainScript = rulesPath.removeFileExtension().lastSegment();
+    ae.setConfigParameterValue(TextMarkerEngine.MAIN_SCRIPT, mainScript);
+    String scriptFolder = rulesPath.removeLastSegments(1).toPortableString();
+    ae.setConfigParameterValue(TextMarkerEngine.SCRIPT_PATHS, new String[] { scriptFolder });
+    // ae.setConfigParameterValue(TextMarkerEngine.SEEDERS, new String[] {""});
+    ae.setConfigParameterValue(TextMarkerEngine.ADDITIONAL_SCRIPTS, new String[0]);
+
+    try {
+      ae.reconfigure();
+    } catch (ResourceConfigurationException e) {
+      e.printStackTrace();
+      return null;
+    }
+    return ae;
+  }
+
+  /**
+   * Checks that every slot type passed to the learner exists in the type system of the test
+   * CAS. An error status naming the missing types is sent to the delegate only if at least one
+   * type is missing.
+   * 
+   * @return <code>true</code> if all slot types were found, <code>false</code> if a type is
+   *         missing or no test CAS could be obtained
+   */
+  protected boolean checkForMandatoryTypes() {
+    CAS someCas = getTestCAS();
+    if (someCas == null) {
+      // getTestCAS() returns null when the CAS allocation failed
+      sendStatusUpdateToDelegate("Error: Could not create a CAS for checking the TypeSystem!",
+              TextRulerLearnerState.ML_ERROR, false);
+      return false;
+    }
+    TypeSystem ts = someCas.getTypeSystem();
+    StringBuilder missingTypes = new StringBuilder();
+    boolean result = true;
+    for (String s : slotNames) {
+      if (ts.getType(s) == null) {
+        if (!result) {
+          missingTypes.append(", ");
+        }
+        missingTypes.append(s);
+        result = false;
+      }
+    }
+    if (!result) {
+      // report the error only when something is really missing
+      sendStatusUpdateToDelegate(
+              "Error: Some Slot- or Helper-Types were not found in TypeSystem: " + missingTypes,
+              TextRulerLearnerState.ML_ERROR, false);
+    }
+    return result;
+  }
+
+  /**
+   * Makes sure that the temporary directory exists, creating it together with any missing
+   * parent directories.
+   * 
+   * @return <code>true</code> if the directory exists after the call, <code>false</code>
+   *         otherwise (e.g. the path denotes a regular file or creation failed)
+   */
+  protected boolean createTempDirIfNeccessary() {
+    File dir = new File(tempDirectory());
+    if (dir.isDirectory()) {
+      return true;
+    }
+    // mkdirs() also creates missing parents; the second check covers the case that the
+    // directory was created by someone else in the meantime
+    return dir.mkdirs() || dir.isDirectory();
+  }
+
+  /**
+   * Runs the whole learning process: creates the temporary directory, loads the analysis
+   * engine, checks the slot types, collects the example documents and calls {@link #doRun()}.
+   * Afterwards the CAS cache, the example documents and the shared test CAS are released.
+   * Progress and errors are reported to the delegate.
+   */
+  public void run() {
+    if (!createTempDirIfNeccessary()) {
+      sendStatusUpdateToDelegate("ERROR CREATING TEMPORARY DIRECTORY!",
+              TextRulerLearnerState.ML_ERROR, false);
+      return;
+    }
+
+    getAnalysisEngine(); // be sure that our AE was created...
+
+    if (checkForMandatoryTypes()) {
+      sendStatusUpdateToDelegate("Finding documents...", TextRulerLearnerState.ML_INITIALIZING,
+              false);
+      exampleDocuments = new TextRulerExampleDocumentSet(inputDirectory, casCache);
+      if (!shouldAbort()) {
+        sendStatusUpdateToDelegate("Starting...", TextRulerLearnerState.ML_RUNNING, true);
+
+        try {
+          doRun();
+        } catch (Exception e) {
+          e.printStackTrace();
+          sendStatusUpdateToDelegate("Aborted due to exception!", TextRulerLearnerState.ML_ERROR,
+                  true);
+        }
+
+        // in debug mode the learned rules are additionally dumped into the temp directory
+        if (TextRulerToolkit.DEBUG) {
+          try {
+            File resultFile = new File(tempDirectory() + "results.tm");
+            FileUtils.saveString2File(getResultString(), resultFile);
+          } catch (Exception e) {
+            e.printStackTrace();
+          }
+        }
+        cleanUp();
+      }
+    }
+
+    // release everything that was held for this run
+    casCache.clear();
+    casCache = null;
+    exampleDocuments = null; // clear reference
+    if (algTestCAS != null) {
+      algTestCAS.reset();
+      GlobalCASSource.releaseCAS(algTestCAS); // algTestCAS.release();
+      algTestCAS = null;
+    }
+    if (shouldAbort()) {
+      sendStatusUpdateToDelegate("Aborted!", TextRulerLearnerState.ML_ABORTED, false);
+    }
+  }
+
+  public CAS loadCAS(String fileName, CAS reuseCAS) {
+    return TextRulerToolkit.readCASfromXMIFile(fileName, ae, reuseCAS);
+  }
+
+  /**
+   * Forwards a status update to the delegate; does nothing if no delegate is set.
+   * 
+   * @param statusString
+   *          the status message
+   * @param state
+   *          the learner state to report
+   * @param ruleBaseChanged
+   *          flag passed through to the delegate
+   */
+  protected void sendStatusUpdateToDelegate(String statusString, TextRulerLearnerState state,
+          boolean ruleBaseChanged) {
+    if (delegate == null) {
+      return;
+    }
+    delegate.algorithmStatusUpdate(this, statusString, state, ruleBaseChanged);
+  }
+
+  /**
+   * The actual learning algorithm. Needs to be implemented by concrete algorithm subclasses; it
+   * is called from {@link #run()}, which catches any exception thrown here.
+   */
+  protected abstract void doRun();
+
+
+  /**
+   * Hook that is called by {@link #run()} after {@link #doRun()} has finished. The default
+   * implementation does nothing.
+   */
+  protected void cleanUp() {
+
+  }
+
+  /**
+   * @return the path of the file "rules.tm" inside the temporary directory
+   */
+  public String getTempRulesFileName() {
+    final String folder = tempDirectory();
+    return folder + "rules.tm";
+  }
+
+  /**
+   * @return the path of the file "intermediaterules.tm" inside the temporary directory
+   */
+  public String getIntermediateRulesFileName() {
+    final String folder = tempDirectory();
+    return folder + "intermediaterules.tm";
+  }
+
+  /**
+   * Compares the slot instances found in the test CAS with the positive examples of the
+   * original document and records the outcome in the statistics collector. This is the
+   * standard implementation; it may be overwritten by concrete subclasses if needed.
+   * 
+   * @param originalDoc
+   *          document providing the positive examples
+   * @param testCas
+   *          the CAS whose slot instances are checked
+   * @param target
+   *          the learning target used to create the slot instances
+   * @param c
+   *          collector receiving covered positives and negatives
+   * @param collectNegativeExamples
+   *          if <code>true</code>, uncovered instances are stored as negative examples,
+   *          otherwise they are only counted
+   */
+  public void compareOriginalDocumentWithTestCAS(TextRulerExampleDocument originalDoc, CAS testCas,
+          TextRulerTarget target, TextRulerStatisticsCollector c, boolean collectNegativeExamples) {
+    List<TextRulerExample> expected = originalDoc.getPositiveExamples();
+    List<TextRulerExample> found = originalDoc.createSlotInstancesForCAS(testCas, target, false);
+
+    // TODO if you need false negatives (missing annotations), please reactivate the code
+    // commented out with FALSENEGATIVES
+
+    for (TextRulerExample candidate : found) {
+      TextRulerExample match = TextRulerToolkit.exampleListContainsAnnotation(expected,
+              candidate.getAnnotation());
+      if (match != null) {
+        // add covered example and increment positive counter
+        c.addCoveredPositive(match);
+        // FALSENEGATIVES originalPositives.remove(coveredExample);
+        continue;
+      }
+      if (!collectNegativeExamples) {
+        c.incCoveredNegatives(1);
+        continue;
+      }
+      candidate.setPositive(false);
+      c.addCoveredNegative(candidate);
+    }
+
+    // FALSENEGATIVES c.incMissingPositives(originalPositives.size());
+  }
+
+  /**
+   * @return the value passed as <code>collectNegativeExamples</code> to
+   *         compareOriginalDocumentWithTestCAS whenever a rule is tested on a document
+   */
+  public abstract boolean collectNegativeCoveredInstancesWhenTesting();
+
+  /**
+   * Tests a single rule on a single document using the shared test CAS. The CAS is filled from
+   * the document before the test and reset afterwards.
+   * 
+   * @param rule
+   *          the rule to test
+   * @param doc
+   *          the document to test on
+   * @param c
+   *          collector receiving the covering statistics
+   */
+  public void testRuleOnDocument(final TextRulerRule rule, final TextRulerExampleDocument doc,
+          final TextRulerStatisticsCollector c) {
+    final CAS sharedCas = getTestCAS();
+    final TextRulerTarget ruleTarget = rule.getTarget();
+    doc.resetAndFillTestCAS(sharedCas, ruleTarget);
+    testRuleOnDocument(rule, doc, c, sharedCas);
+    sharedCas.reset();
+  }
+
+  /**
+   * Tests a single rule on an already filled test CAS: the rule is written to the temporary
+   * rules file, the analysis engine processes the CAS and the result is compared with the
+   * original document. Exceptions are printed and otherwise ignored.
+   * 
+   * @param rule
+   *          the rule to test
+   * @param doc
+   *          the original document the result is compared with
+   * @param c
+   *          collector receiving the covering statistics
+   * @param testCas
+   *          the prepared CAS to process
+   */
+  public void testRuleOnDocument(final TextRulerRule rule, final TextRulerExampleDocument doc,
+          final TextRulerStatisticsCollector c, CAS testCas) {
+    if (TextRulerToolkit.DEBUG) {
+      MemoryWatch.watch();
+    }
+    try {
+      String rulesFile = getTempRulesFileName();
+      rule.saveToRulesFile(rulesFile);
+      if (TextRulerToolkit.DEBUG) {
+        // snapshot of the CAS before processing
+        TextRulerToolkit.writeCAStoXMIFile(testCas, tempDirectory() + "testCas.xmi");
+      }
+      ae.process(testCas);
+      if (TextRulerToolkit.DEBUG) {
+        // snapshot of the CAS after processing
+        TextRulerToolkit.writeCAStoXMIFile(testCas, tempDirectory() + "testCasProcessed.xmi");
+      }
+      boolean collectNegatives = collectNegativeCoveredInstancesWhenTesting();
+      compareOriginalDocumentWithTestCAS(doc, testCas, rule.getTarget(), c, collectNegatives);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Tests one rule on all documents of the given set and stores the summed statistics in the
+   * rule. Testing stops early when an abort was requested; the statistics collected so far are
+   * still stored.
+   * 
+   * If you have many rules to test, please use testRulesOnDocumentSet for performance reasons!
+   * 
+   * @param rule
+   *          the rule to test
+   * @param documents
+   *          the documents to test on
+   */
+  public void testRuleOnDocumentSet(final TextRulerRule rule,
+          final TextRulerExampleDocumentSet documents) {
+    TextRulerStatisticsCollector total = new TextRulerStatisticsCollector();
+    TextRulerExampleDocument[] docs = documents.getSortedDocumentsInCacheOptimizedOrder();
+
+    for (int i = 0; i < docs.length; i++) {
+      testRuleOnDocument(rule, docs[i], total);
+      if (shouldAbort()) {
+        break;
+      }
+    }
+    rule.setCoveringStatistics(total);
+  }
+
+  /**
+   * Writes the given script to the temporary rules file and lets the analysis engine process
+   * the shared test CAS filled from the given document. Exceptions while saving or processing
+   * are printed and otherwise ignored.
+   * 
+   * @param script
+   *          the script text to apply
+   * @param doc
+   *          the document used to fill the test CAS
+   * @param target
+   *          the learning target used when filling the test CAS
+   * @return the shared test CAS (not reset by this method)
+   */
+  public CAS applyScriptOnDocument(String script, final TextRulerExampleDocument doc,
+          TextRulerTarget target) {
+    File rulesFile = new File(getTempRulesFileName());
+    try {
+      FileUtils.saveString2File(script, rulesFile);
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    AnalysisEngine engine = getAnalysisEngine();
+    CAS sharedCas = getTestCAS();
+    doc.resetAndFillTestCAS(sharedCas, target);
+    try {
+      engine.process(sharedCas);
+    } catch (AnalysisEngineProcessException e) {
+      e.printStackTrace();
+    }
+    return sharedCas;
+  }
+
+  /**
+   * Tests all given rules on all documents of the set and stores the summed statistics in each
+   * rule. The target of the first rule is used for filling the test CAS, so all rules are
+   * expected to share the same learning target. If an abort is requested, the method returns
+   * immediately without storing any statistics.
+   * 
+   * @param rules
+   *          the rules to test; nothing happens for an empty list
+   * @param documents
+   *          the documents to test on
+   */
+  public void testRulesOnDocumentSet(final List<? extends TextRulerRule> rules,
+          final TextRulerExampleDocumentSet documents) {
+    if (rules.isEmpty()) {
+      return;
+    }
+    List<TextRulerStatisticsCollector> collectors = new ArrayList<TextRulerStatisticsCollector>();
+    TextRulerExampleDocument[] docs = documents.getSortedDocumentsInCacheOptimizedOrder();
+    TextRulerTarget sharedTarget = rules.get(0).getTarget();
+
+    // one collector per rule, at the same index as the rule
+    for (int i = 0; i < rules.size(); i++) {
+      collectors.add(new TextRulerStatisticsCollector());
+    }
+
+    CAS sharedCas = getTestCAS();
+    for (TextRulerExampleDocument currentDoc : docs) {
+      for (int i = 0; i < rules.size(); i++) {
+        TextRulerRule currentRule = rules.get(i);
+        TextRulerStatisticsCollector currentCollector = collectors.get(i);
+
+        if (TextRulerToolkit.DEBUG && !sharedTarget.equals(currentRule.getTarget())) {
+          TextRulerToolkit
+                  .log("[TextRulerBasicLearner.testRulesOnTrainingsSet] ERROR, ALL RULES MUST HAVE THE SAME LEARNING TARGET !");
+        }
+        currentDoc.resetAndFillTestCAS(sharedCas, sharedTarget);
+        testRuleOnDocument(currentRule, currentDoc, currentCollector, sharedCas);
+        if (shouldAbort()) {
+          return;
+        }
+      }
+    }
+    // do not release the shared test CAS, only reset it! It gets released at the end of the
+    // whole algorithm.
+    sharedCas.reset();
+    for (int i = 0; i < rules.size(); i++) {
+      rules.get(i).setCoveringStatistics(collectors.get(i));
+    }
+  }
+
+  /**
+   * Tests all given rules on one document and stores the resulting statistics in each rule.
+   * The target of the first rule is used for filling the test CAS, so all rules are expected
+   * to share the same learning target. If an abort is requested, the method returns
+   * immediately without storing any statistics.
+   * 
+   * @param rules
+   *          the rules to test; nothing happens for an empty list
+   * @param document
+   *          the document to test on
+   */
+  public void testRulesOnDocument(final List<? extends TextRulerRule> rules,
+          final TextRulerExampleDocument document) {
+    if (rules.isEmpty()) {
+      return;
+    }
+    List<TextRulerStatisticsCollector> collectors = new ArrayList<TextRulerStatisticsCollector>();
+    TextRulerTarget sharedTarget = rules.get(0).getTarget();
+    // one collector per rule, at the same index as the rule
+    for (int i = 0; i < rules.size(); i++) {
+      collectors.add(new TextRulerStatisticsCollector());
+    }
+
+    CAS sharedCas = getTestCAS();
+    for (int i = 0; i < rules.size(); i++) {
+      TextRulerRule currentRule = rules.get(i);
+      TextRulerStatisticsCollector currentCollector = collectors.get(i);
+
+      if (TextRulerToolkit.DEBUG && !sharedTarget.equals(currentRule.getTarget())) {
+        TextRulerToolkit
+                .log("[TextRulerBasicLearner.testRulesOnTrainingsSet] ERROR, ALL RULES MUST HAVE THE SAME LEARNING TARGET !");
+      }
+      document.resetAndFillTestCAS(sharedCas, sharedTarget);
+      testRuleOnDocument(currentRule, document, currentCollector, sharedCas);
+      if (shouldAbort()) {
+        return;
+      }
+    }
+    // do not release the shared test CAS, only reset it! It gets released at the end of the
+    // whole algorithm.
+    sharedCas.reset();
+    for (int i = 0; i < rules.size(); i++) {
+      rules.get(i).setCoveringStatistics(collectors.get(i));
+    }
+  }
+
+  public String getTMFileHeaderString() {
+    return getTMPackageString() + getTypeSystemImport() + getTMFilterCommandString();
+  }
+
+  /**
+   * @return the TYPESYSTEM import statement for generated script files, followed by an empty
+   *         line
+   */
+  private String getTypeSystemImport() {
+    String typeSystemName = getTypeSystemString(preprocessorTMFile);
+    return "TYPESYSTEM " + typeSystemName + ";\n\n";
+  }
+
+  /**
+   * Returns the name of the type system that generated scripts import.
+   * 
+   * @param fileString
+   *          path of the preprocessing script; currently ignored
+   * @return a fixed type system name
+   */
+  private String getTypeSystemString(String fileString) {
+    // TODO derive the type system from fileString instead of returning a fixed name
+    return "org.apache.uima.tm.citie.CompleteTypeSystemTypeSystem";
+  }
+
+  /**
+   * @return the PACKAGE declaration for generated script files, followed by an empty line
+   */
+  public String getTMPackageString() {
+    return "PACKAGE org.apache.uima.tm;\n\n";
+  }
+
+  public String getTMFilterCommandString() {
+    if (filterSet != null && filterSet.size() > 0) {
+      String fs = "";
+      for (String s : filterSet)
+        if (fs.length() == 0)
+          fs += TextRulerToolkit.getTypeShortName(s);
+        else
+          fs += ", " + TextRulerToolkit.getTypeShortName(s);
+
+      return "Document{->FILTERTYPE(" + fs + ")};\n\n";
+    } else
+      return "";
+  }
+
+  /**
+   * Returns the shared test CAS of this learner, allocating it from GlobalCASSource on first
+   * use.
+   * 
+   * One big memory problem occurred when old CASes were reset and released and new ones were
+   * created for every test (and e.g. in CasCache for every loaded XMI). Maybe this is a UIMA
+   * memory issue? Using an almost static amount of CAS objects and reusing them works without
+   * leaking and also brought a performance boost, so this is preferred now.
+   * 
+   * @return the shared test CAS, or <code>null</code> if allocating it failed
+   */
+  public CAS getTestCAS() {
+    if (algTestCAS != null) {
+      return algTestCAS;
+    }
+    try {
+      algTestCAS = GlobalCASSource.allocCAS(ae);
+    } catch (Exception e) {
+      e.printStackTrace();
+      return null;
+    }
+    return algTestCAS;
+  }
+
+  /**
+   * Appends the learner settings and the given algorithm parameters to the file "settings.txt"
+   * in the temporary directory. Nothing is written if the temporary directory cannot be
+   * created.
+   * 
+   * @param params
+   *          algorithm parameters, written as "key = value" lines; <code>null</code> is treated
+   *          like an empty map
+   */
+  protected void saveParametersToTempFolder(Map<String, Object> params) {
+    if (!createTempDirIfNeccessary()) {
+      return;
+    }
+    StringBuilder str = new StringBuilder("\nSettings:\n\n");
+    str.append("inputDir: ").append(inputDirectory);
+    str.append("\ntempDir: ").append(tempDirectory);
+    str.append("\npreprocessTMFile: ").append(preprocessorTMFile);
+    str.append("\n");
+
+    if (params != null) {
+      for (Entry<String, Object> e : params.entrySet()) {
+        str.append(e.getKey()).append(" = ").append(e.getValue()).append("\n");
+      }
+    }
+    TextRulerToolkit.appendStringToFile(tempDirectory() + "settings.txt", str.toString());
+  }
+
+  /**
+   * @return the filter set that was passed to the constructor (the same instance, not a copy)
+   */
+  public Set<String> getFilterSet() {
+    return filterSet;
+  }
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,129 @@
+package org.apache.uima.tm.textruler.core;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+
+
+/**
+ * 
+ * TextRulerExample encapsulates a single-slot, multi-slot or single-slot-boundary problem instance.
+ * This can be positive or negative examples from an example document, or it can be coverings of a
+ * rule or multiple rules that were applied to a document...
+ * 
+ *         hint: this could be renamed to MLInstance ?
+ * 
+ */
+public class TextRulerExample {
+
+  protected TextRulerExampleDocument document;
+
+  // a single-slot example has only one annotation; entries may be null (see toString)
+  protected TextRulerAnnotation annotations[];
+
+  protected boolean isPositive;
+
+  protected TextRulerTarget target;
+
+  /**
+   * Creates an example with exactly one annotation.
+   * 
+   * @param document
+   *          the document the example belongs to
+   * @param annotation
+   *          the single annotation of the example
+   * @param isPositive
+   *          whether this is a positive example
+   * @param target
+   *          the learning target of the example
+   */
+  public TextRulerExample(TextRulerExampleDocument document, TextRulerAnnotation annotation,
+          boolean isPositive, TextRulerTarget target) {
+    TextRulerAnnotation singleAnnot[] = { annotation };
+    this.document = document;
+    this.isPositive = isPositive;
+    this.target = target;
+    this.annotations = singleAnnot;
+  }
+
+  /**
+   * Creates an example with several annotations. The given array is stored as is, not copied.
+   * 
+   * @param document
+   *          the document the example belongs to
+   * @param annotations
+   *          the annotations of the example
+   * @param isPositive
+   *          whether this is a positive example
+   * @param target
+   *          the learning target of the example
+   */
+  public TextRulerExample(TextRulerExampleDocument document, TextRulerAnnotation annotations[],
+          boolean isPositive, TextRulerTarget target) {
+    this.document = document;
+    this.isPositive = isPositive;
+    this.target = target;
+    this.annotations = annotations;
+  }
+
+  public TextRulerExampleDocument getDocument() {
+    return document;
+  }
+
+  /**
+   * @return the CAS of the document this example belongs to
+   */
+  public CAS getDocumentCAS() {
+    return document.getCAS();
+  }
+
+  /**
+   * @return the first annotation of this example
+   */
+  public TextRulerAnnotation getAnnotation() {
+    return annotations[0];
+  }
+
+  /**
+   * @return the internal annotation array (not a copy)
+   */
+  public TextRulerAnnotation[] getAnnotations() {
+    return annotations;
+  }
+
+  public boolean isPositive() {
+    return isPositive;
+  }
+
+  /**
+   * Changes the positive flag. Note that the flag is part of {@link #equals(Object)} and
+   * {@link #hashCode()}.
+   */
+  public void setPositive(boolean flag) {
+    isPositive = flag;
+  }
+
+  public TextRulerTarget getTarget() {
+    return target;
+  }
+
+  /**
+   * Multi-slot examples are rendered as "type:text;" per annotation ("&lt;NULL&gt;" for missing
+   * ones), whole-slot examples as their covered text and all other examples as "START at"
+   * followed by the begin offset. Examples without a usable annotation yield "&lt;no text&gt;".
+   */
+  @Override
+  public String toString() {
+    if (target.type == MLTargetType.MULTI_SLOT) {
+      if (annotations == null) {
+        return "<no text>";
+      }
+      StringBuilder str = new StringBuilder();
+      for (TextRulerAnnotation a : annotations) {
+        if (a == null) {
+          str.append("<NULL>");
+        } else {
+          str.append(a.getType().getShortName()).append(":").append(a.getCoveredText())
+                  .append(";");
+        }
+      }
+      return str.toString();
+    }
+
+    // single-slot and boundary examples only use the first annotation
+    if (annotations == null || annotations.length == 0 || annotations[0] == null) {
+      return "<no text>";
+    }
+    if (target.type == MLTargetType.SINGLE_WHOLE_SLOT) {
+      return getAnnotation().getCoveredText();
+    }
+    return "START at " + getAnnotation().getBegin(); // +","+getAnnotation().getEnd();
+  }
+
+  /**
+   * Two examples are equal if they refer to the same CAS file name, have the same positive
+   * flag, equal targets and pairwise equal annotations. Any object that is not a
+   * TextRulerExample (including <code>null</code>) is not equal.
+   */
+  @Override
+  public boolean equals(Object ob) {
+    if (this == ob) {
+      return true;
+    }
+    if (!(ob instanceof TextRulerExample)) {
+      return false;
+    }
+    TextRulerExample o = (TextRulerExample) ob;
+
+    boolean result = document.getCasFileName().equals(o.document.getCasFileName())
+            && (isPositive == o.isPositive) && target.equals(o.target);
+    if (!result) {
+      return false;
+    }
+
+    // element-wise comparison that also copes with null arrays and null entries
+    return java.util.Arrays.equals(annotations, o.annotations);
+  }
+
+  /**
+   * Hash code built from the CAS file name, the positive flag, the target type and the offsets
+   * of all annotations. Missing annotations do not contribute any offsets.
+   */
+  @Override
+  public int hashCode() {
+    int result = document.getCasFileName().hashCode()
+            * (isPositive ? 2 : 1)
+            * (target.type == MLTargetType.MULTI_SLOT ? 1
+                    : (target.type == MLTargetType.SINGLE_WHOLE_SLOT ? 2
+                            : (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY ? 3 : 4)));
+    if (annotations == null) {
+      return result;
+    }
+    int i = 1;
+    for (TextRulerAnnotation a : annotations) {
+      if (a == null) {
+        // same factor as an annotation with begin == end == 0 would give
+        result *= i;
+      } else {
+        result *= i * (a.getBegin() + 1) * (a.getEnd() + 1);
+      }
+      i++;
+    }
+
+    return result;
+  }
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,338 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+
+
+/**
+ * 
+ * TextRulerExampleDocument stands for one document usually loaded from an XMI file. It uses the
+ * given CasCache for storing its CAS with the XMI filename as the key.
+ * 
+ * It holds ArrayLists for positive and negative MLExamples which can be filled on demand for a
+ * given learning target. E.g. single slot algorithms learn rules for each slot separately, so the
+ * work-flow is to clear the current examples and create new for the next slot target. The same is
+ * with single slot boundary algorithms like LP2: It first creates all left boundary examples,
+ * learns from them, clears the examples and creates the right boundary examples and so on.
+ * 
+ * This class also provides the functionality to extract and create MLExamples of a given document
+ * or test CAS for a given TextRulerTarget.
+ * 
+ * Especially for boundary algorithms you can call createBoundaryAnnotationsForCas to get boundary
+ * annotations at the beginnings and endings of an example slot.
+ * 
+ * Caution (this is quite a bit inconvenient at the moment!): If a CAS gets loaded from the
+ * casCache, you have to call createBoundaryAnnotationsForCas again, so your casLoader must be aware
+ * of that (see BasicLP2 for an example) !
+ * 
+ *         hint: this could be renamed to MLDocument instead of TextRulerExampleDocument ?
+ */
+public class TextRulerExampleDocument {
+
+  protected String casFileName;
+
+  protected CasCache casCache;
+
+  protected List<TextRulerExample> positiveExamples = new ArrayList<TextRulerExample>();
+
+  protected List<TextRulerExample> negativeExamples = new ArrayList<TextRulerExample>();
+
+  /**
+   * @param casFileName
+   *          the file name used as key when asking the cache for the CAS
+   * @param casCache
+   *          the cache that provides the CAS of this document
+   */
+  public TextRulerExampleDocument(String casFileName, CasCache casCache) {
+    this.casFileName = casFileName;
+    this.casCache = casCache;
+  }
+
+  /**
+   * @return the CAS the cache delivers for this document's file name
+   */
+  public CAS getCAS() {
+    final CAS cached = casCache.getCAS(casFileName);
+    return cached;
+  }
+
+  /**
+   * @return the internal list of positive examples (not a copy)
+   */
+  public List<TextRulerExample> getPositiveExamples() {
+    return positiveExamples;
+  }
+
+  /**
+   * @return the internal list of negative examples (not a copy)
+   */
+  public List<TextRulerExample> getNegativeExamples() {
+    return negativeExamples;
+  }
+
+  /**
+   * Replaces the positive examples by the slot instances that are created from this document's
+   * own CAS for the given target, using the raw type name.
+   * 
+   * @param target
+   *          the learning target to create the examples for
+   */
+  protected void createPositiveExamplesForTarget(TextRulerTarget target) {
+    CAS documentCas = getCAS();
+    positiveExamples = createSlotInstancesForCAS(documentCas, target, true);
+  }
+
+  /**
+   * Creates the examples (slot instances) for the given learning target. Depending on the
+   * target, one of three strategies is used:
+   * <ul>
+   * <li>multi-slot: the annotations of aCas are scanned in index order and grouped into examples
+   * with one entry per slot name; entries of slots that were not found stay <code>null</code></li>
+   * <li>left/right correction: aCas is not used; the learner's test CAS is filled from this
+   * document and processed by the learner's analysis engine, and for every wrongly covered
+   * boundary a shift example towards a nearby correct boundary is created. This branch calls
+   * createExamplesForTarget and thereby replaces the stored positive examples of this document</li>
+   * <li>otherwise: one example per annotation of the single slot type found in aCas</li>
+   * </ul>
+   * 
+   * @param aCas
+   *          the CAS to extract the slot instances from
+   * @param target
+   *          the learning target
+   * @param createFromRawTypeName
+   *          only used for single-slot targets: if <code>true</code>, the raw slot type name is
+   *          used for the lookup, otherwise the slot type name of the target
+   * @return a new list with the created examples; all of them are flagged as positive
+   */
+  public List<TextRulerExample> createSlotInstancesForCAS(CAS aCas, TextRulerTarget target,
+          boolean createFromRawTypeName) {
+    List<TextRulerExample> result = new ArrayList<TextRulerExample>();
+
+    if (target.isMultiSlot()) {
+      TypeSystem ts = aCas.getTypeSystem();
+      // index of the slot that is expected next within the current example
+      int currentSlotIndex = 0;
+      TextRulerAnnotation[] currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
+      // resolved types of the slot names, in slot order
+      List<Type> slotTypes = new ArrayList<Type>();
+      for (String s : target.slotNames)
+        slotTypes.add(ts.getType(s));
+
+      for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().iterator(true); it.isValid(); it
+              .moveToNext()) {
+        AnnotationFS fs = (AnnotationFS) it.get();
+        Type theType = fs.getType();
+        if (slotTypes.contains(theType)) {
+          int idx = slotTypes.indexOf(theType);
+          if (idx < currentSlotIndex) // the previous example was not
+          // complete, so we have to write
+          // it down:
+          {
+            result.add(new TextRulerExample(this, currentAnnotations, true, target));
+            currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
+          }
+          currentAnnotations[idx] = new TextRulerAnnotation(fs, this);
+          if (idx >= target.slotNames.length - 1) {
+            // the last slot was filled, so the example is complete
+            result.add(new TextRulerExample(this, currentAnnotations, true, target));
+            currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
+            currentSlotIndex = 0;
+          } else
+            currentSlotIndex = idx + 1;
+        }
+      }
+      // write down a trailing, incomplete example
+      if (currentSlotIndex > 0) {
+        result.add(new TextRulerExample(this, currentAnnotations, true, target));
+      }
+
+    } else if (target.isLeftCorrection() || target.isRightCorrection()) {
+      // TODO
+      // NOTE: aCas is not used in this branch; the learner's shared test CAS is used instead
+      TextRulerBasicLearner learner = target.getLearner();
+      Set<String> filterSet = learner.getFilterSet();
+      CAS testCAS = learner.getTestCAS();
+      TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
+      resetAndFillTestCAS(testCAS, target);
+      CAS docCAS = getCAS();
+      TypeSystem ts = docCAS.getTypeSystem();
+      Type tokensRootType = ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+      AnalysisEngine analysisEngine = learner.getAnalysisEngine();
+      try {
+        analysisEngine.process(testCAS);
+      } catch (AnalysisEngineProcessException e) {
+        // TODO add log here
+        // NOTE(review): a processing failure is silently ignored, the comparison below still
+        // runs on the test CAS
+      }
+      // boundary target with the same slot names, used to find correct and wrong boundaries
+      TextRulerTarget newTarget = new TextRulerTarget(target.slotNames, target.getLearner());
+      if (target.isLeftCorrection()) {
+        newTarget.type = TextRulerTarget.MLTargetType.SINGLE_LEFT_BOUNDARY;
+      } else {
+        newTarget.type = TextRulerTarget.MLTargetType.SINGLE_RIGHT_BOUNDARY;
+      }
+      // side effect: replaces the stored positive examples of this document
+      createExamplesForTarget(newTarget);
+      learner.compareOriginalDocumentWithTestCAS(this, testCAS, newTarget, c, true);
+      List<TextRulerExample> correctTags = getPositiveExamples();
+      List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>(c
+              .getCoveredNegativeExamples());
+      for (TextRulerExample wrongTag : wrongTags) {
+        // test, if there's a corresponding positive example
+        // somewhere around (within maxDistance)
+        List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag
+                .getAnnotation().getBegin(), target.getMaxShiftDistance(), TextRulerToolkit
+                .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType);
+        List<AnnotationFS> right = TextRulerToolkit.getAnnotationsAfterPosition(docCAS, wrongTag
+                .getAnnotation().getEnd(), target.getMaxShiftDistance() + 1, TextRulerToolkit
+                .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType);
+
+        // NOTE(review): fails if 'right' is empty; presumably the first element is the
+        // annotation at the wrong tag itself (hence the "+ 1" above) - TODO confirm
+        right.remove(0);
+
+        // TODO stop after the first found match or create one bad
+        // example for each found occurence ??!!
+        // for now: stop after one ! so create only ONE bad
+        // example...
+        // search to the left, starting with the annotation closest to the wrong tag; note that
+        // leftDistance is also advanced when no match is found
+        int leftDistance = 0;
+        TextRulerExample leftCorrectTag = null;
+        for (int i = left.size() - 1; i >= 0; i--) {
+          leftDistance++;
+          TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(left.get(i),
+                  this, target, docCAS.getTypeSystem());
+          // Only checks the beginning of needle
+          leftCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags,
+                  needle);
+          if (leftCorrectTag != null)
+            break;
+        }
+
+        // search to the right in list order; rightDistance is also advanced when no match is
+        // found
+        int rightDistance = 0;
+        TextRulerExample rightCorrectTag = null;
+        for (AnnotationFS fs : right) {
+          rightDistance++;
+          TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(fs, this, target,
+                  docCAS.getTypeSystem());
+          // Only checks the beginning of needle
+          rightCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags,
+                  needle);
+          if (rightCorrectTag != null)
+            break;
+        }
+
+        // prefer the closer correct tag
+        TextRulerExample theCorrectTag = null;
+        if (rightDistance < leftDistance && rightCorrectTag != null)
+          theCorrectTag = rightCorrectTag;
+        else if (rightDistance > leftDistance && leftCorrectTag != null)
+          theCorrectTag = leftCorrectTag;
+        else // use the one that would lie in the slot filler:
+        {
+          // NOTE(review): 'target' is the correction target of this branch; whether its type
+          // can be SINGLE_LEFT_BOUNDARY (or newTarget was meant) is not visible here - confirm
+          if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY && rightCorrectTag != null)
+            theCorrectTag = rightCorrectTag;
+          else
+            theCorrectTag = leftCorrectTag;
+        }
+
+        if (theCorrectTag != null) {
+          TextRulerToolkit.log("FOUND BAD EXAMPLE FOR SHIFTING !!");
+          TextRulerShiftExample shiftExample = new TextRulerShiftExample(this, wrongTag
+                  .getAnnotation(), theCorrectTag.getAnnotation(), true, target);
+          result.add(shiftExample);
+        }
+      }
+
+    } else {
+      List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas,
+              createFromRawTypeName ? target.getSingleSlotRawTypeName() : target
+                      .getSingleSlotTypeName()); // do not use
+      // boundary type
+      // here since we
+      // seek for the
+      // orignial slot
+      // !
+      for (AnnotationFS a : slots) {
+        result.add(new TextRulerExample(this, TextRulerToolkit.convertToTargetAnnotation(a, this,
+                target, aCas.getTypeSystem()), true, target));
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Hook for creating the negative examples of this document for the given target. This default
+   * implementation intentionally does nothing.
+   * 
+   * @param target
+   *          the learning target for which negative examples would be created
+   */
+  protected void createNegativeExamplesForTarget(TextRulerTarget target) {
+    // The default implementation does not support negative examples;
+    // subclasses can override this if needed. Alternatively, this could be
+    // passed as an argument to the constructor.
+  }
+
+  /**
+   * Creates the examples of this document for the given target by calling
+   * createPositiveExamplesForTarget first and createNegativeExamplesForTarget afterwards.
+   * 
+   * @param target
+   *          the learning target that is handed to both creation methods
+   */
+  public void createExamplesForTarget(TextRulerTarget target) {
+    createPositiveExamplesForTarget(target);
+    createNegativeExamplesForTarget(target);
+  }
+
+  /**
+   * Empties both example collections (positiveExamples and negativeExamples) of this document.
+   */
+  public void clearCurrentExamples() {
+    positiveExamples.clear();
+    negativeExamples.clear();
+  }
+
+  /**
+   * Fills the passed test CAS with the document text and the annotations of this document's CAS,
+   * leaving out the annotations of the learning target. The result can be used for testing e.g. a
+   * rule or a rule set.
+   * 
+   * Caution: testCas gets reset first!
+   * 
+   * Annotations whose type name is unknown in the type system of the test CAS are not copied; a
+   * message is logged for each of them.
+   * 
+   * @param testCas
+   *          the CAS that is reset and filled
+   * @param target
+   *          the learning target whose slot types (and, for boundary targets, also the base slot
+   *          types) are excluded from copying
+   */
+  public void resetAndFillTestCAS(CAS testCas, TextRulerTarget target) {
+    testCas.reset();
+    CAS docCas = getCAS();
+    testCas.setDocumentText(docCas.getDocumentText());
+
+    // copy all annotations except the target-annotations:
+    TypeSystem ts = docCas.getTypeSystem();
+
+    List<Type> slotTypes = new ArrayList<Type>();
+
+    for (String s : target.getSlotTypeNames()) {
+      slotTypes.add(ts.getType(s));
+    }
+
+    if (target.isBoundary()) {
+      // add the base types (without START and END markers) also !
+      for (String s : target.slotNames) {
+        slotTypes.add(ts.getType(s));
+      }
+    }
+
+    for (FSIterator<AnnotationFS> it = docCas.getAnnotationIndex().iterator(true); it.isValid(); it
+            .moveToNext()) {
+      AnnotationFS fs = it.get();
+      Type fsType = fs.getType();
+      if (slotTypes.contains(fsType)) {
+        // target annotation: must not show up in the test CAS
+        continue;
+      }
+      // the test CAS has its own type system, so the type is looked up by name there
+      Type t = testCas.getTypeSystem().getType(fsType.getName());
+      if (t != null) {
+        AnnotationFS createAnnotation = testCas.createAnnotation(t, fs.getBegin(), fs.getEnd());
+        testCas.addFsToIndexes(createAnnotation);
+      } else {
+        TextRulerToolkit.log("Type " + fsType.getName() + " is unknown in test CAS");
+      }
+    }
+  }
+
+  /**
+   * @return the value of the casFileName field of this document
+   */
+  public String getCasFileName() {
+    return casFileName;
+  }
+
+  /**
+   * Adds a left and a right boundary annotation to the CAS for every annotation of the given slot.
+   * The left boundary annotation spans the first token found within the slot annotation, the right
+   * boundary annotation spans the last one. The boundary types are looked up by appending
+   * TextRulerToolkit.LEFT_BOUNDARY_EXTENSION resp. TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION to
+   * the slot name.
+   * 
+   * Slot annotations that do not contain any token (after filtering) are skipped, since there is
+   * nothing to derive the boundaries from.
+   * 
+   * @param aCas
+   *          the CAS to read the slot annotations from and to add the boundary annotations to
+   * @param slotName
+   *          the type name of the slot
+   * @param tokenFilterSet
+   *          the filter set used (together with the slot name) when collecting the slot's tokens
+   */
+  public static void createBoundaryAnnotationsForCas(CAS aCas, String slotName,
+          Set<String> tokenFilterSet) {
+    List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName);
+    TypeSystem ts = aCas.getTypeSystem();
+    // these types do not depend on the single slot annotation, so look them up only once:
+    Type tokensRootType = ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+    Type typeLB = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
+    Type typeRB = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+    for (AnnotationFS a : slots) {
+
+      List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas, a
+              .getBegin(), a.getEnd(), TextRulerToolkit.getFilterSetWithSlotName(slotName,
+              tokenFilterSet), tokensRootType);
+      if (slotTokens == null || slotTokens.isEmpty()) {
+        // without tokens get(0) would fail; skip this slot instead of aborting the whole run
+        TextRulerToolkit.log("no tokens found within slot annotation [" + a.getBegin() + ","
+                + a.getEnd() + "] of " + slotName + ", skipping boundary creation");
+        continue;
+      }
+      AnnotationFS first = slotTokens.get(0);
+      AnnotationFS last = slotTokens.get(slotTokens.size() - 1);
+      aCas.addFsToIndexes(aCas.createAnnotation(typeLB, first.getBegin(), first.getEnd()));
+      aCas.addFsToIndexes(aCas.createAnnotation(typeRB, last.getBegin(), last.getEnd()));
+    }
+  }
+
+  public static void removeBoundaryAnnotationsFromCas(CAS aCas, String slotName) {
+    // this method is not tested yet!
+    TypeSystem ts = aCas.getTypeSystem();
+    Type startType = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
+    Type endType = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+    List<AnnotationFS> removeList = new ArrayList<AnnotationFS>();
+    for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(startType).iterator(true); it
+            .isValid(); it.moveToNext()) {
+      AnnotationFS fs = it.get();
+      removeList.add(fs);
+    }
+    for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(endType).iterator(true); it
+            .isValid(); it.moveToNext()) {
+      AnnotationFS fs = it.get();
+      removeList.add(fs);
+    }
+    for (AnnotationFS fs : removeList)
+      aCas.removeFsFromIndexes(fs);
+  }
+
+  public static synchronized TextRulerExample exampleListContainsAnnotation(
+          List<TextRulerExample> list, TextRulerAnnotation ann) {
+    TextRulerExample needle = new TextRulerExample(null, ann, true, null);
+
+    int index = Collections.binarySearch(list, needle, new Comparator<TextRulerExample>() {
+      public int compare(TextRulerExample o1, TextRulerExample o2) {
+        TextRulerAnnotation afs1 = o1.getAnnotation();
+        TextRulerAnnotation afs2 = o2.getAnnotation();
+        if (afs1.getBegin() < afs2.getBegin())
+          return -1;
+        else if (afs1.getBegin() > afs2.getBegin())
+          return 1;
+        else
+          return 0;
+      }
+    });
+    if (index >= 0)
+      return list.get(index);
+    else
+      return null;
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,225 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+
+/**
+ * 
+ * TextRulerExampleDocumentSet encapsulates an input set of documents, e.g. examples for a learning
+ * algorithm. It creates an instance of TextRulerExampleDocument for each found XMI file of the
+ * passed input folder
+ * 
+ * For loading CASes you have to provide an CasCache. If you use TextRulerBasicLearner, this is done
+ * for you automatically.
+ * 
+ *         hint: this could be renamed to MLDocumentSet instead of TextRulerExampleDocumentSet ?
+ */
+public class TextRulerExampleDocumentSet {
+
+  protected List<TextRulerExampleDocument> documents;
+
+  protected CasCache casCache;
+
+  public TextRulerExampleDocumentSet(String xmiFolderName, CasCache casCache) {
+    super();
+    documents = new ArrayList<TextRulerExampleDocument>();
+    this.casCache = casCache;
+    File trainingFolder = new File(xmiFolderName);
+    File[] files = trainingFolder.listFiles(new FilenameFilter() {
+      public boolean accept(File dir, String name) {
+        return (name.endsWith(".xmi"));
+      }
+    });
+
+    for (File file : files) {
+      TextRulerToolkit.log("found document XMI file: " + file.getName());
+      documents.add(new TextRulerExampleDocument(file.getAbsolutePath(), casCache));
+    }
+  }
+
+  // for subset creations:
+  protected TextRulerExampleDocumentSet(String[] inputXmiFiles, CasCache casCache) {
+    super();
+    this.casCache = casCache;
+    documents = new ArrayList<TextRulerExampleDocument>();
+    for (String fileName : inputXmiFiles)
+      documents.add(new TextRulerExampleDocument(fileName, casCache));
+  }
+
+  public void createExamplesForTarget(TextRulerTarget target) {
+    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder();
+    for (TextRulerExampleDocument doc : sortedDocs) {
+      doc.createExamplesForTarget(target);
+    }
+  }
+
+  public void clearCurrentExamples() {
+    for (TextRulerExampleDocument doc : documents)
+      doc.clearCurrentExamples();
+  }
+
+  public Collection<CAS> getCachedCASes() {
+    return casCache.getCachedCASes();
+  }
+
+  public boolean casCacheContainsKey(String key) {
+    return casCache.containsElementWithKey(key);
+  }
+
+  public List<TextRulerExample> getAllExamples() {
+    return getAllExamples(false);
+  }
+
+  public List<TextRulerExample> getAllPositiveExamples() {
+    return getAllExamples(true);
+  }
+
+  public List<TextRulerExample> getAllExamples(boolean onlyPositives) {
+    List<TextRulerExample> result = new ArrayList<TextRulerExample>();
+    for (TextRulerExampleDocument doc : documents) {
+      result.addAll(doc.getPositiveExamples());
+      if (!onlyPositives)
+        result.addAll(doc.getNegativeExamples());
+    }
+    return result;
+  }
+
+  public List<TextRulerExampleDocument> getDocuments() {
+    return documents;
+  }
+
+  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder(
+          Collection<TextRulerExampleDocument> documents) {
+    Set<TextRulerExampleDocument> docsLeft = new HashSet<TextRulerExampleDocument>(documents);
+    TextRulerExampleDocument[] sortedDocs = new TextRulerExampleDocument[documents.size()];
+
+    // "sort" the currently cached documents to the front of the document
+    // list, so that
+    // we can use them directly and do not have to reload all docs everytime
+    // we come here!
+    int i = 0;
+    for (TextRulerExampleDocument doc : documents) {
+      if (casCacheContainsKey(doc.getCasFileName())) {
+        docsLeft.remove(doc);
+        sortedDocs[i] = doc;
+        i++;
+      }
+    }
+    for (TextRulerExampleDocument doc : docsLeft) {
+      sortedDocs[i] = doc;
+      i++;
+    }
+    if (TextRulerToolkit.DEBUG) {
+      TextRulerToolkit.logIf(i != documents.size(), "ERROR, SIZE MISMATCH!");
+    }
+
+    return sortedDocs;
+  }
+
+  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder() {
+    return getSortedDocumentsInCacheOptimizedOrder(documents);
+  }
+
+  public List<Integer> getTokenCountHistogrammForSlotName(String slotName, Set<String> filterSet) {
+    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
+    int maxLen = 0;
+
+    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder(documents);
+
+    for (TextRulerExampleDocument doc : sortedDocs) {
+      CAS aCas = doc.getCAS();
+      List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName);
+      TypeSystem ts = aCas.getTypeSystem();
+      for (AnnotationFS a : slots) {
+
+        List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas, a
+                .getBegin(), a.getEnd(), TextRulerToolkit.getFilterSetWithSlotName(slotName,
+                filterSet), ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME));
+        int len = slotTokens.size();
+        if (len > maxLen)
+          maxLen = len;
+        Integer key = new Integer(len);
+        int current = map.containsKey(key) ? map.get(key) : 0;
+        map.put(key, len + current);
+      }
+    }
+    List<Integer> resultList = new ArrayList<Integer>(maxLen + 1);
+    for (int i = 0; i <= maxLen; i++) {
+      int value = map.containsKey(i) ? map.get(i) : 0;
+      resultList.add(value);
+    }
+    return resultList;
+  }
+
+  public CAS getCAS(String key) {
+    return casCache.getCAS(key);
+  }
+
+  public int size() {
+    return documents.size();
+  }
+
+  public TextRulerExampleDocument getDocumentForFileName(String fileName) {
+    for (TextRulerExampleDocument doc : documents)
+      if (doc.getCasFileName().equals(fileName))
+        return doc;
+    return null;
+  }
+
+  // TODO this is not tested yet!
+  public List<TextRulerExampleDocumentSet> partitionIntoSubsets(int[] percentages) {
+    List<TextRulerExampleDocumentSet> result = new ArrayList<TextRulerExampleDocumentSet>();
+
+    int sum = 0;
+    for (int p : percentages) {
+      if (p == 0) {
+        TextRulerToolkit
+                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero!");
+        return null;
+      }
+      sum += p;
+    }
+    if (sum != 100) {
+      TextRulerToolkit
+              .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] percentages has to be 100 in total!");
+      return null;
+    }
+
+    int rest = size();
+    int docIndex = 0;
+
+    for (int i = 0; i < percentages.length; i++) {
+      int partSize;
+      if (i == percentages.length - 1) {
+        partSize = Math.round((((percentages[i] * size()) / 100.0f)));
+        if (partSize == 0)
+          partSize = 1;
+      } else
+        partSize = rest;
+
+      if (partSize == 0) {
+        TextRulerToolkit
+                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero! too few example documents for your partition?");
+        return null;
+      }
+      String[] fileNames = new String[partSize];
+      for (int doc = 0; doc < partSize; doc++)
+        fileNames[doc] = documents.get(doc + docIndex).getCasFileName();
+      docIndex += partSize;
+      result.add(new TextRulerExampleDocumentSet(fileNames, casCache));
+      rest -= partSize;
+    }
+    return result;
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,106 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.tm.textruler.core.TextRulerRuleItem.MLRuleItemType;
+
+
+/**
+ * 
+ * TextRulerMultiSlotRule extends the basic class TextRulerRule with multi-slot specific
+ * functionality.
+ * 
+ * A multi-slot rule holds one TextRulerSlotPattern per slot, each of which consists of three
+ * patterns: prefiller, filler and postfiller (see TextRulerSlotPattern).
+ * 
+ */
+public class TextRulerMultiSlotRule extends TextRulerRule {
+
+  protected List<TextRulerSlotPattern> slotPatterns = new ArrayList<TextRulerSlotPattern>();
+
+  /**
+   * Copy constructor: copies the basic rule data and every slot pattern of the original.
+   */
+  public TextRulerMultiSlotRule(TextRulerMultiSlotRule copyFrom) {
+    super(copyFrom);
+
+    for (TextRulerSlotPattern original : copyFrom.slotPatterns) {
+      slotPatterns.add(original.copy());
+    }
+  }
+
+  /**
+   * @return the short name of the target's type for the slot with the given index
+   */
+  public String getMarkName(int slotIndex) {
+    String slotTypeName = target.getMultiSlotTypeName(slotIndex);
+    return TextRulerToolkit.getTypeShortName(slotTypeName);
+  }
+
+  public TextRulerMultiSlotRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+    super(parentAlgorithm, target);
+  }
+
+  /**
+   * @return the rule string fragment that is put between the patterns of two neighbouring slots
+   */
+  protected String getInterslotWildCard() {
+    return "ALL*? ";
+  }
+
+  /**
+   * Builds the rule string from all slot patterns: for each slot the items of its prefiller,
+   * filler and postfiller pattern (each followed by a blank), with the interslot wildcard between
+   * two slots. The result is trimmed, terminated with a semicolon and stored; afterwards the rule
+   * is marked as compiled.
+   */
+  @Override
+  public void compileRuleString() {
+    final int slotCount = slotPatterns.size();
+
+    // number of items of the whole rule; one wildcard is counted between each two slots
+    int totalSize = Math.max(slotCount - 1, 0);
+    for (TextRulerSlotPattern pattern : slotPatterns) {
+      totalSize += pattern.preFillerPattern.size();
+      totalSize += pattern.fillerPattern.size();
+      totalSize += pattern.postFillerPattern.size();
+    }
+
+    StringBuilder compiled = new StringBuilder();
+    int totalIndex = 0;
+    for (int slotIndex = 0; slotIndex < slotCount; slotIndex++) {
+      TextRulerSlotPattern current = slotPatterns.get(slotIndex);
+      totalIndex = appendItems(compiled, current.preFillerPattern, current.preFillerPattern
+              .size(), MLRuleItemType.PREFILLER, totalIndex, totalSize, slotIndex);
+      totalIndex = appendItems(compiled, current.fillerPattern, current.fillerPattern.size(),
+              MLRuleItemType.FILLER, totalIndex, totalSize, slotIndex);
+      totalIndex = appendItems(compiled, current.postFillerPattern, current.postFillerPattern
+              .size(), MLRuleItemType.POSTFILLER, totalIndex, totalSize, slotIndex);
+
+      boolean anotherSlotFollows = slotIndex < slotCount - 1;
+      if (anotherSlotFollows) {
+        // add interslot wildcard:
+        compiled.append(getInterslotWildCard());
+        totalIndex++;
+      }
+    }
+
+    this.ruleString = compiled.toString().trim() + ";";
+    setNeedsCompile(false);
+  }
+
+  /**
+   * Appends the rule string of every item of one pattern, each followed by a blank.
+   * 
+   * @return the rule-wide item index after the last appended item
+   */
+  private int appendItems(StringBuilder out, Iterable<? extends TextRulerRuleItem> items,
+          int patternSize, MLRuleItemType type, int totalIndex, int totalSize, int slotIndex) {
+    int indexInPattern = 0;
+    for (TextRulerRuleItem item : items) {
+      out.append(item.getStringForRuleString(this, type, indexInPattern, patternSize,
+              totalIndex, totalSize, slotIndex));
+      out.append(" ");
+      indexInPattern++;
+      totalIndex++;
+    }
+    return totalIndex;
+  }
+
+  /**
+   * @return the internal list of slot patterns (not a copy)
+   */
+  public List<TextRulerSlotPattern> getPatterns() {
+    return slotPatterns;
+  }
+
+  @Override
+  public TextRulerMultiSlotRule copy() {
+    return new TextRulerMultiSlotRule(this);
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,107 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+
+import org.apache.uima.util.FileUtils;
+
+/**
+ * 
+ * TextRulerRule is the basic class for any kind of TextMarker-Rule representation for any learning
+ * algorithm. A rule usually has a parent algorithm (that created it) and a learning target
+ * (TextRulerTarget).
+ * 
+ * The subclasses TextRulerSingleSlotRule and TextRulerMultiSlotRule add slot specific issues to it
+ * and every algorithm then has to subclass one of those two and provide a class that implements
+ * TextRulerRuleItem.
+ * 
+ * Two rules are considered equal if their rule strings are equal.
+ * 
+ */
+public abstract class TextRulerRule {
+
+  protected TextRulerBasicLearner algorithm;
+
+  protected boolean needsCompile = true;
+
+  protected String ruleString;
+
+  protected TextRulerTarget target;
+
+  protected TextRulerStatisticsCollector coveringStatistics;
+
+  /**
+   * Copy constructor: takes over algorithm, compile state and rule string, and creates copies of
+   * the target and (if present) of the covering statistics.
+   */
+  public TextRulerRule(TextRulerRule copyFrom) {
+    algorithm = copyFrom.algorithm;
+    needsCompile = copyFrom.needsCompile;
+    ruleString = copyFrom.ruleString;
+    target = new TextRulerTarget(copyFrom.target, algorithm);
+    coveringStatistics = copyFrom.coveringStatistics != null ? copyFrom.coveringStatistics.copy()
+            : null;
+  }
+
+  public TextRulerRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+    super();
+    algorithm = parentAlgorithm;
+    this.target = target;
+  }
+
+  /**
+   * @return the rule string; it is compiled first if the rule is marked as needing a compile
+   */
+  public String getRuleString() {
+    if (needsCompile) {
+      compileRuleString();
+    }
+    return ruleString;
+  }
+
+  public TextRulerTarget getTarget() {
+    return target;
+  }
+
+  /**
+   * Builds the rule string of this rule. getRuleString calls this whenever the rule is marked as
+   * needing a compile.
+   */
+  public abstract void compileRuleString();
+
+  public void setNeedsCompile(boolean flag) {
+    needsCompile = flag;
+  }
+
+  /**
+   * @return the file header of the parent algorithm followed by the rule string and a line break
+   */
+  protected String getRulesFileContent() {
+    return algorithm.getTMFileHeaderString() + getRuleString() + "\n";
+  }
+
+  /**
+   * Writes the rules file content of this rule to the given file. Errors while saving are not
+   * propagated, only their stack trace is printed.
+   */
+  public void saveToRulesFile(String filename) {
+    File file = new File(filename);
+    String str = getRulesFileContent();
+    try {
+      FileUtils.saveString2File(str, file);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Compares the rule strings of two rules. Anything that is not a TextRulerRule (including null)
+   * is never equal to a rule.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (!(obj instanceof TextRulerRule)) {
+      // also covers null; the equals contract demands false here instead of an exception
+      return false;
+    }
+    String ownRuleString = this.getRuleString();
+    String otherRuleString = ((TextRulerRule) obj).getRuleString();
+    return ownRuleString == null ? otherRuleString == null : ownRuleString
+            .equals(otherRuleString);
+  }
+
+  /**
+   * @return the hash code of the rule string (0 if there is none), consistent with equals
+   */
+  @Override
+  public int hashCode() {
+    String ownRuleString = this.getRuleString();
+    return ownRuleString == null ? 0 : ownRuleString.hashCode();
+  }
+
+  /**
+   * Stores a copy of the passed statistics (or null if null is passed).
+   */
+  public void setCoveringStatistics(TextRulerStatisticsCollector c) {
+    coveringStatistics = c != null ? c.copy() : null;
+  }
+
+  public TextRulerStatisticsCollector getCoveringStatistics() {
+    return coveringStatistics;
+  }
+
+  public abstract TextRulerRule copy();
+
+  @Override
+  public String toString() {
+    // do not compile here, even if it would be necessary! just return the current rule string!
+    return ruleString == null ? "<not compiled>" : ruleString;
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,27 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ * 
+ * Every rule pattern (TextRulerRulePattern) consists of items. Those have to implement the
+ * interface TextRulerRuleItem.
+ * 
+ *         hint: maybe we should change this to an abstract class instead of an interface ?!
+ */
+public interface TextRulerRuleItem {
+
+  /**
+   * The kind of pattern an item belongs to within a slot pattern.
+   */
+  public enum MLRuleItemType {
+    PREFILLER, FILLER, POSTFILLER
+  };
+
+  /**
+   * Returns the string this item contributes to the rule string of the given rule. The parameter
+   * descriptions follow the way TextRulerMultiSlotRule.compileRuleString calls this method.
+   * 
+   * @param rule
+   *          the rule that is being compiled
+   * @param type
+   *          the kind of pattern this item is part of
+   * @param numberInPattern
+   *          the zero-based index of this item within its pattern
+   * @param patternSize
+   *          the number of items of the pattern this item is part of
+   * @param numberInRule
+   *          the zero-based index of this item within the whole rule
+   * @param ruleSize
+   *          the number of items of the whole rule
+   * @param slotIndex
+   *          the index of the slot whose pattern this item is part of
+   * @return the rule string fragment of this item
+   */
+  public String getStringForRuleString(TextRulerRule rule, MLRuleItemType type,
+          int numberInPattern, int patternSize, int numberInRule, int ruleSize, int slotIndex);
+
+  /**
+   * Compares this item with another rule item. Note that this method takes a TextRulerRuleItem
+   * and therefore overloads Object.equals(Object) instead of overriding it.
+   */
+  public boolean equals(TextRulerRuleItem o);
+
+  /**
+   * @return a copy of this item
+   */
+  public TextRulerRuleItem copy();
+
+  // TextRulerRulePattern compares items by their toString() result when searching sub patterns
+  public String toString();
+
+  public int hashCode();
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,58 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import org.apache.uima.util.FileUtils;
+
+/**
+ * 
+ * TextRulerRuleList holds a list of rules and offers some additional functionality on top of it,
+ * like rendering the rules as text or saving them to a TextMarker rule file.
+ * 
+ */
+public class TextRulerRuleList extends ArrayList<TextRulerRule> {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Writes the header followed by all rules to the given file. Errors while saving are not
+   * propagated, only their stack trace is printed.
+   */
+  public void saveToRulesFile(String filename, String fileHeader) {
+    File target = new File(filename);
+    try {
+      String content = getTMFileString(fileHeader);
+      FileUtils.saveString2File(content, target);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Adds the rule unless the list already contains an equal one.
+   * 
+   * @return true if the rule was added, false if it was already contained
+   */
+  public boolean addRule(TextRulerRule rule) {
+    if (this.contains(rule)) {
+      return false;
+    }
+    this.add(rule);
+    return true;
+  }
+
+  /**
+   * @return all rules as text, without a limit for the length of a rule string
+   */
+  public String getRulesString(String linePrefix) {
+    return getRulesString(linePrefix, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Renders one line per rule: the line prefix, the rule string and, as a trailing comment, the
+   * covering statistics of the rule. Rule strings longer than the given maximum are replaced by a
+   * placeholder.
+   */
+  public String getRulesString(String linePrefix, int maxRuleStringLength) {
+    StringBuilder lines = new StringBuilder();
+    for (TextRulerRule rule : this) {
+      String shownRule = rule.getRuleString();
+      if (shownRule.length() > maxRuleStringLength) {
+        shownRule = "<too long to display>";
+      }
+      lines.append(linePrefix).append(shownRule).append("\t// ").append(
+              rule.getCoveringStatistics()).append("\n");
+    }
+    return lines.toString();
+  }
+
+  /**
+   * @return the header followed by all rules, without a limit for the length of a rule string
+   */
+  public String getTMFileString(String header) {
+    return header + getRulesString("", Integer.MAX_VALUE);
+  }
+
+  /**
+   * @return the header followed by all rules, with the given limit for the length of a rule string
+   */
+  public String getTMFileString(String header, int maxRuleStringLength) {
+    return header + getRulesString("", maxRuleStringLength);
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,80 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.ArrayList;
+
+/**
+ * 
+ * TextRulerRulePattern is an ordered list of rule items. On top of the list it offers some helpers
+ * for working with rule patterns, like searching for or extracting sub patterns.
+ * 
+ *         hint: this is a very basic implementation and could surely be optimized ;-)
+ */
+public class TextRulerRulePattern extends ArrayList<TextRulerRuleItem> {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * @return the string representations of all items, separated by single blanks
+   */
+  @Override
+  public String toString() {
+    StringBuilder joined = new StringBuilder();
+    for (TextRulerRuleItem item : this) {
+      joined.append(" ").append(item);
+    }
+    return joined.toString().trim();
+  }
+
+  /**
+   * Searches for the first occurrence of the given sub pattern. Items are compared by their string
+   * representation.
+   * 
+   * @return the index at which the sub pattern starts, or -1 if it is empty, longer than this
+   *         pattern or not contained
+   */
+  public int find(TextRulerRulePattern subPattern) {
+    int needleSize = subPattern.size();
+    if (needleSize == 0 || size() < needleSize) {
+      return -1;
+    }
+    int lastStart = size() - needleSize;
+    for (int start = 0; start <= lastStart; start++) {
+      // count how many leading items of the sub pattern match at this position
+      int matched = 0;
+      while (matched < needleSize
+              && get(start + matched).toString().equals(subPattern.get(matched).toString())) {
+        matched++;
+      }
+      if (matched == needleSize) {
+        return start;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Returns a new pattern with up to length items of this pattern, beginning at index start. The
+   * items themselves are not copied. A negative length means "as many as this pattern has"; the
+   * result ends at the end of this pattern at the latest.
+   */
+  public TextRulerRulePattern subPattern(int start, int length) {
+    TextRulerRulePattern result = new TextRulerRulePattern();
+    int wanted = length < 0 ? size() : length;
+    for (int taken = 0, index = start; taken < wanted && index < size(); taken++, index++) {
+      result.add(get(index));
+    }
+    return result;
+  }
+
+  /**
+   * @return a new pattern containing a copy of every item of this pattern
+   */
+  public TextRulerRulePattern copy() {
+    TextRulerRulePattern duplicate = new TextRulerRulePattern();
+    for (TextRulerRuleItem item : this) {
+      duplicate.add(item.copy());
+    }
+    return duplicate;
+  }
+
+  /**
+   * @return the last item, or null if this pattern is empty
+   */
+  public TextRulerRuleItem lastItem() {
+    return size() > 0 ? get(size() - 1) : null;
+  }
+
+  /**
+   * @return the first item, or null if this pattern is empty
+   */
+  public TextRulerRuleItem firstItem() {
+    return size() > 0 ? get(0) : null;
+  }
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,41 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ * 
+ * This class carries the additional information an example needs for learning LP2 correction
+ * rules. Besides the original, correct slot boundary annotation it also needs the wrong
+ * annotation; or, seen the other way round, besides the wrong annotation it needs the correct
+ * annotation to which the boundary tag has to be shifted.
+ * 
+ * Since TextRulerExample provides a possibility to store more than one annotation for
+ * multi-slot examples, we can easily use this storage for those two annotations. But for the sake
+ * of code readability and better understanding, this additional class with named accessors is
+ * created.
+ * 
+ */
+public class TextRulerShiftExample extends TextRulerExample {
+
+  /**
+   * Creates a shift example. The superclass is initialized without annotations; afterwards the
+   * annotations array is filled with exactly two entries.
+   * 
+   * @param document
+   *          passed through to the TextRulerExample constructor
+   * @param wrongAnnotation
+   *          stored at index 0 of the annotations array
+   * @param correctAnnotation
+   *          stored at index 1 of the annotations array
+   * @param isPositive
+   *          passed through to the TextRulerExample constructor
+   * @param target
+   *          passed through to the TextRulerExample constructor
+   */
+  public TextRulerShiftExample(TextRulerExampleDocument document,
+          TextRulerAnnotation wrongAnnotation, TextRulerAnnotation correctAnnotation,
+          boolean isPositive, TextRulerTarget target) {
+    super(document, (TextRulerAnnotation[]) null, isPositive, target);
+    // index 0 = wrong annotation, index 1 = correct annotation (see the accessors below)
+    annotations = new TextRulerAnnotation[] { wrongAnnotation, correctAnnotation };
+  }
+
+  /**
+   * @return the annotation stored at index 0, i.e. the wrong one
+   */
+  public TextRulerAnnotation wrongAnnotation() {
+    return annotations[0];
+  }
+
+  /**
+   * @return the annotation stored at index 1, i.e. the correct one
+   */
+  public TextRulerAnnotation correctAnnotation() {
+    return annotations[1];
+  }
+
+  /**
+   * @return the begin of the wrong annotation, the separator " --> " and the begin of the correct
+   *         annotation
+   */
+  @Override
+  public String toString() {
+    return wrongAnnotation().getBegin() + " --> " + correctAnnotation().getBegin();
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,164 @@
+package org.apache.uima.tm.textruler.core;
+
+import org.apache.uima.tm.textruler.core.TextRulerRuleItem.MLRuleItemType;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+
+/**
+ * 
+ * TextRulerSingleSlotRule adds single-slot specific stuff to the basic class TextRulerRule.
+ * 
+ * A single-slot rule consists of one TextRulerSlotPattern, which in turn consists of three
+ * patterns: prefiller, filler and postfiller (see TextRulerSlotPattern).
+ * 
+ */
+public class TextRulerSingleSlotRule extends TextRulerRule {
+
+  protected TextRulerSlotPattern slotPattern = new TextRulerSlotPattern();
+
+  public TextRulerSingleSlotRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+    super(parentAlgorithm, target);
+  }
+
+  public TextRulerSingleSlotRule(TextRulerSingleSlotRule copyFrom) {
+    super(copyFrom);
+    slotPattern = copyFrom.slotPattern.copy();
+  }
+
+  public TextRulerRulePattern getPreFillerPattern() {
+    return slotPattern.preFillerPattern;
+  }
+
+  public TextRulerRulePattern getFillerPattern() {
+    return slotPattern.fillerPattern;
+  }
+
+  public TextRulerRulePattern getPostFillerPattern() {
+    return slotPattern.postFillerPattern;
+  }
+
+  public String getMarkName() {
+    return TextRulerToolkit.getTypeShortName(target.getSingleSlotTypeName());
+  }
+
+  @Override
+  public void compileRuleString() {
+    String ruleString = "";
+    int preCount = slotPattern.preFillerPattern.size();
+    int postCount = slotPattern.postFillerPattern.size();
+    int fillerCount = slotPattern.fillerPattern.size();
+    int totalSize = preCount + postCount + fillerCount;
+    int index = 0;
+    int totalIndex = 0;
+    for (TextRulerRuleItem item : slotPattern.preFillerPattern) {
+      ruleString += item.getStringForRuleString(this, MLRuleItemType.PREFILLER, index, preCount,
+              totalIndex, totalSize, 0)
+              + " ";
+      index++;
+      totalIndex++;
+    }
+
+    index = 0;
+    for (TextRulerRuleItem item : slotPattern.fillerPattern) {
+      ruleString += item.getStringForRuleString(this, MLRuleItemType.FILLER, index, fillerCount,
+              totalIndex, totalSize, 0)
+              + " ";
+      index++;
+      totalIndex++;
+    }
+
+    index = 0;
+    for (TextRulerRuleItem item : slotPattern.postFillerPattern) {
+      ruleString += item.getStringForRuleString(this, MLRuleItemType.POSTFILLER, index, postCount,
+              totalIndex, totalSize, 0)
+              + " ";
+      index++;
+      totalIndex++;
+    }
+
+    ruleString = ruleString.trim();
+    ruleString += ";";
+    this.ruleString = ruleString;
+    setNeedsCompile(false);
+  }
+
+  public void addPreFillerItem(TextRulerRuleItem item) {
+    slotPattern.preFillerPattern.add(0, item);
+    setNeedsCompile(true);
+  }
+
+  public void addPreFillerItemWithNormalOrder(TextRulerRuleItem item) {
+    slotPattern.preFillerPattern.add(item);
+    setNeedsCompile(true);
+  }
+
+  public void addPostFillerItem(TextRulerRuleItem item) {
+    slotPattern.postFillerPattern.add(item);
+    setNeedsCompile(true);
+  }
+
+  public void addFillerItem(TextRulerRuleItem item) {
+    if (target.type == MLTargetType.SINGLE_WHOLE_SLOT) {
+      slotPattern.fillerPattern.add(item);
+      setNeedsCompile(true);
+    } else {
+      new Exception("[TextRulerRule] BOUNDARY SLOT RULES CANNOT HAVE FILLER ITEMS!");
+    }
+  }
+
+  public TextRulerRuleItem getOutermostPreFillerItem() {
+    if (slotPattern.preFillerPattern.size() == 0)
+      return null;
+    else
+      return slotPattern.preFillerPattern.get(0);
+  }
+
+  public TextRulerRuleItem getOutermostPostFillerItem() {
+    if (slotPattern.postFillerPattern.size() == 0)
+      return null;
+    else
+      return slotPattern.postFillerPattern.get(slotPattern.postFillerPattern.size() - 1);
+  }
+
+  public void removeOutermostPreFillerItem() {
+    if (slotPattern.preFillerPattern.size() > 0) {
+      slotPattern.preFillerPattern.remove(0);
+      setNeedsCompile(true);
+    }
+  }
+
+  public void removeOutermostPostFillerItem() {
+    if (slotPattern.postFillerPattern.size() > 0) {
+      slotPattern.postFillerPattern.remove(slotPattern.postFillerPattern.size() - 1);
+      setNeedsCompile(true);
+    }
+  }
+
+  public TextRulerRuleItem getRuleItemWithIndex(int index) {
+    int i = index;
+    if (i < slotPattern.preFillerPattern.size())
+      return slotPattern.preFillerPattern.get(i);
+    else
+      i -= slotPattern.preFillerPattern.size();
+
+    if (i < slotPattern.fillerPattern.size())
+      return slotPattern.fillerPattern.get(i);
+    else
+      i -= slotPattern.fillerPattern.size();
+
+    if (i < slotPattern.postFillerPattern.size())
+      return slotPattern.postFillerPattern.get(i);
+    else
+      return null;
+  }
+
+  public int totalItemCount() {
+    return slotPattern.preFillerPattern.size() + slotPattern.fillerPattern.size()
+            + slotPattern.postFillerPattern.size();
+  }
+
+  @Override
+  public TextRulerSingleSlotRule copy() {
+    return new TextRulerSingleSlotRule(this);
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,35 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ * 
+ * TextRulerSlotPattern holds the three slot patterns we usually have for a TextMarker-Rule: a
+ * preFiller pattern, a filler pattern and a post filler pattern
+ * 
+ */
+public class TextRulerSlotPattern {
+
+  /** Items in front of the filler. */
+  public TextRulerRulePattern preFillerPattern = new TextRulerRulePattern();
+
+  /** Items of the filler itself. */
+  public TextRulerRulePattern fillerPattern = new TextRulerRulePattern();
+
+  /** Items behind the filler. */
+  public TextRulerRulePattern postFillerPattern = new TextRulerRulePattern();
+
+  /**
+   * Creates a slot pattern with three freshly created patterns.
+   */
+  public TextRulerSlotPattern() {
+    super();
+  }
+
+  /**
+   * Copy constructor: fills the three patterns of the new object with copy() results of the
+   * corresponding items of the source object.
+   * 
+   * @param copyFrom
+   *          the slot pattern to copy
+   */
+  public TextRulerSlotPattern(TextRulerSlotPattern copyFrom) {
+    super();
+    appendCopies(copyFrom.preFillerPattern, preFillerPattern);
+    appendCopies(copyFrom.fillerPattern, fillerPattern);
+    appendCopies(copyFrom.postFillerPattern, postFillerPattern);
+  }
+
+  /**
+   * Adds a copy() of every item of source to destination.
+   */
+  private static void appendCopies(TextRulerRulePattern source, TextRulerRulePattern destination) {
+    for (TextRulerRuleItem item : source) {
+      destination.add(item.copy());
+    }
+  }
+
+  /**
+   * @return a new slot pattern created through the copy constructor
+   */
+  public TextRulerSlotPattern copy() {
+    return new TextRulerSlotPattern(this);
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,126 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * 
+ * TextRulerStatisticsCollector is used for collecting data while e.g. testing a rule.
+ * 
+ * It holds one HashSet each for the covered positive and the covered negative examples of e.g. the
+ * training documents and counts the true positives (coveredPositives), false positives
+ * (coveredNegatives), ...
+ * 
+ * Currently false negatives (missedPositives) are not counted (see
+ * TextRulerExampleDocument.compareOriginalDocumentWithTestCAS) but this functionality can be easily
+ * added (since it exists as commented out code)
+ * 
+ */
+public class TextRulerStatisticsCollector {
+
+  protected int p = 0; // covered positive examples (true positives)
+
+  protected int n = 0; // covered negative examples (false positives)
+
+  // protected int missedPositives = 0; // (false negatives)
+
+  protected Set<TextRulerExample> coveredPositives = new HashSet<TextRulerExample>();
+
+  protected Set<TextRulerExample> coveredNegatives = new HashSet<TextRulerExample>();
+
+  public TextRulerStatisticsCollector() {
+    super();
+  }
+
+  public TextRulerStatisticsCollector(TextRulerStatisticsCollector c) {
+    p = c.p;
+    n = c.n;
+    coveredPositives.addAll(c.coveredPositives);
+    coveredNegatives.addAll(c.coveredNegatives);
+  }
+
+  public int getTotalCoveredExamples() {
+    return p + n;
+  }
+
+  public int getCoveredPositivesCount() {
+    if (TextRulerToolkit.DEBUG && p != coveredPositives.size()) {
+      TextRulerToolkit.log("WHY is P different from coveredPositives.size() ??");
+    }
+    return p;
+  }
+
+  public int getCoveredNegativesCount() {
+    return n;
+  }
+
+  public Set<TextRulerExample> getCoveredPositiveExamples() {
+    return coveredPositives;
+  }
+
+  public Set<TextRulerExample> getCoveredNegativeExamples() {
+    return coveredNegatives;
+  }
+
+  // public int getMissedPositives()
+  // {
+  // return missedPositives;
+  // }
+
+  public void reflectCountsFromCoveredExamples() {
+    p = coveredPositives.size();
+    n = coveredNegatives.size();
+  }
+
+  public void reset() {
+    p = 0;
+    n = 0;
+    // missedPositives = 0 ;
+    // coveredDocuments = 0;
+    coveredPositives.clear();
+    coveredNegatives.clear();
+  }
+
+  public void incCoveredPositives(int count) {
+    p += count;
+  }
+
+  public void incCoveredNegatives(int count) {
+    n += count;
+  }
+
+  public void addCoveredPositive(TextRulerExample e) {
+    if (coveredPositives.add(e))
+      incCoveredPositives(1);
+    else
+      TextRulerToolkit.logIfDebug("TRIED TO ADD A POSITIVE COVERED EXAMPLE TWICE !!");
+  }
+
+  public void addCoveredNegative(TextRulerExample e) {
+    if (coveredNegatives.add(e))
+      incCoveredNegatives(1);
+    else
+      TextRulerToolkit.logIfDebug("TRIED TO ADD A NEGATIVE COVERED EXAMPLE TWICE !!");
+  }
+
+  public TextRulerStatisticsCollector copy() {
+    return new TextRulerStatisticsCollector(this);
+  }
+
+  public void add(TextRulerStatisticsCollector c) {
+    incCoveredNegatives(c.n);
+    incCoveredPositives(c.p);
+    coveredPositives.addAll(c.coveredPositives);
+    coveredNegatives.addAll(c.coveredNegatives);
+  }
+
+  @Override
+  public String toString() {
+    return "p=" + p + "; n=" + n;
+  }
+
+  // public void incCoveredMissedPositives(int count)
+  // {
+  // missedPositives += count;
+  // }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java Mon Aug  1 14:21:12 2011
@@ -0,0 +1,160 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ * 
+ * TextRulerTarget encapsulates a learning target of an ML algorithm. It currently can be
+ * <ul>
+ * <li>multi slot (full slots, not boundary!)</li>
+ * <li>single slot (full slot)</li>
+ * <li>single boundary (single slot left or right boundary)</li>
+ * </ul>
+ * 
+ * A TextRulerRule, for example, is induced for a TextRulerTarget, and a testCAS for a given
+ * TextRulerExampleDocument is created especially for a given target.
+ * 
+ */
+public class TextRulerTarget {
+
+  /** The kinds of learning targets. */
+  public enum MLTargetType {
+    MULTI_SLOT, SINGLE_WHOLE_SLOT, SINGLE_LEFT_BOUNDARY, SINGLE_RIGHT_BOUNDARY, SINGLE_LEFT_CORRECTION, SINGLE_RIGHT_CORRECTION
+  }
+
+  /** Raw slot names; the single-slot constructors store exactly one entry. */
+  public String[] slotNames;
+
+  /** The kind of this target. */
+  public MLTargetType type;
+
+  private TextRulerBasicLearner learner;
+
+  private int maxShiftDistance = 0;
+
+  /**
+   * Copy constructor. The slot name array is cloned; type and max shift distance are taken over.
+   * 
+   * @param copyFrom
+   *          the target to copy
+   * @param owner
+   *          the learner the new target belongs to
+   */
+  public TextRulerTarget(TextRulerTarget copyFrom, TextRulerBasicLearner owner) {
+    this.slotNames = copyFrom.slotNames.clone();
+    this.type = copyFrom.type;
+    this.learner = owner;
+    // a copy has to carry the complete state, including the shift distance
+    this.maxShiftDistance = copyFrom.maxShiftDistance;
+  }
+
+  /**
+   * Creates a MULTI_SLOT target. The given array is stored as is (not cloned).
+   * 
+   * @param slotNames
+   *          the raw slot names
+   * @param owner
+   *          the learner the new target belongs to
+   */
+  public TextRulerTarget(String slotNames[], TextRulerBasicLearner owner) {
+    this.slotNames = slotNames;
+    type = MLTargetType.MULTI_SLOT;
+    this.learner = owner;
+  }
+
+  /**
+   * Creates a SINGLE_WHOLE_SLOT target.
+   * 
+   * @param slotName
+   *          the raw slot name
+   * @param owner
+   *          the learner the new target belongs to
+   */
+  public TextRulerTarget(String slotName, TextRulerBasicLearner owner) {
+    this(slotName, MLTargetType.SINGLE_WHOLE_SLOT, owner);
+  }
+
+  /**
+   * Creates a single-slot target of the given type.
+   * 
+   * @param slotName
+   *          the raw slot name
+   * @param type
+   *          the kind of the target
+   * @param owner
+   *          the learner the new target belongs to
+   */
+  public TextRulerTarget(String slotName, MLTargetType type, TextRulerBasicLearner owner) {
+    slotNames = new String[] { slotName };
+    this.type = type;
+    this.learner = owner;
+  }
+
+  /**
+   * @return the learner passed to the constructor
+   */
+  public TextRulerBasicLearner getLearner() {
+    return learner;
+  }
+
+  /**
+   * @param slotIndex
+   *          index into the slot name array
+   * @return the raw slot name at that index
+   */
+  public String getMultiSlotTypeName(int slotIndex) {
+    return slotNames[slotIndex];
+  }
+
+  /**
+   * @return the first raw slot name, without any boundary extension
+   */
+  public String getSingleSlotRawTypeName() {
+    return slotNames[0];
+  }
+
+  /**
+   * @return the type name for this target's type and first slot name, see
+   *         {@link #getSingleSlotTypeName(MLTargetType, String)}
+   */
+  public String getSingleSlotTypeName() {
+    return getSingleSlotTypeName(type, slotNames[0]);
+  }
+
+  /**
+   * Derives the type name for a single-slot target.
+   * 
+   * @param t
+   *          the kind of the target
+   * @param slotName
+   *          the raw slot name
+   * @return null for MULTI_SLOT; the slot name with TextRulerToolkit.LEFT_BOUNDARY_EXTENSION or
+   *         TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION appended for the two boundary types; the
+   *         unchanged slot name otherwise
+   */
+  public static String getSingleSlotTypeName(MLTargetType t, String slotName) {
+    if (t == MLTargetType.MULTI_SLOT) {
+      return null;
+    }
+    if (t == MLTargetType.SINGLE_LEFT_BOUNDARY) {
+      return slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION;
+    }
+    if (t == MLTargetType.SINGLE_RIGHT_BOUNDARY) {
+      return slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION;
+    }
+    return slotName;
+  }
+
+  /**
+   * @return the slot name array itself (not a copy)
+   */
+  public String[] getRawSlotNames() {
+    return slotNames;
+  }
+
+  /**
+   * @return the slot name array itself for MULTI_SLOT targets; otherwise a new one-element array
+   *         holding {@link #getSingleSlotTypeName()}
+   */
+  public String[] getSlotTypeNames() {
+    if (type == MLTargetType.MULTI_SLOT) {
+      return slotNames;
+    }
+    return new String[] { getSingleSlotTypeName() };
+  }
+
+  /**
+   * @return true if the type is MULTI_SLOT
+   */
+  public boolean isMultiSlot() {
+    return type == MLTargetType.MULTI_SLOT;
+  }
+
+  /**
+   * @return true if the type is SINGLE_LEFT_BOUNDARY or SINGLE_RIGHT_BOUNDARY
+   */
+  public boolean isBoundary() {
+    return type == MLTargetType.SINGLE_LEFT_BOUNDARY || type == MLTargetType.SINGLE_RIGHT_BOUNDARY;
+  }
+
+  /**
+   * @return true if the type is SINGLE_LEFT_BOUNDARY
+   */
+  public boolean isLeftBoundary() {
+    return type == MLTargetType.SINGLE_LEFT_BOUNDARY;
+  }
+
+  /**
+   * @return true if the type is SINGLE_RIGHT_BOUNDARY
+   */
+  public boolean isRightBoundary() {
+    return type == MLTargetType.SINGLE_RIGHT_BOUNDARY;
+  }
+
+  /**
+   * @return true if the type is SINGLE_LEFT_CORRECTION
+   */
+  public boolean isLeftCorrection() {
+    return type == MLTargetType.SINGLE_LEFT_CORRECTION;
+  }
+
+  /**
+   * @return true if the type is SINGLE_RIGHT_CORRECTION
+   */
+  public boolean isRightCorrection() {
+    return type == MLTargetType.SINGLE_RIGHT_CORRECTION;
+  }
+
+  /**
+   * Creates the target for the opposite boundary of the same slot and learner.
+   * 
+   * @return a new right boundary target for a left boundary target and vice versa; null if this
+   *         target is not a boundary target
+   */
+  public TextRulerTarget getCounterPartBoundaryTarget() {
+    if (!isBoundary()) {
+      return null;
+    }
+    MLTargetType counterPart = (type == MLTargetType.SINGLE_LEFT_BOUNDARY)
+            ? MLTargetType.SINGLE_RIGHT_BOUNDARY
+            : MLTargetType.SINGLE_LEFT_BOUNDARY;
+    return new TextRulerTarget(slotNames[0], counterPart, learner);
+  }
+
+  /**
+   * @return the max shift distance (0 unless set)
+   */
+  public int getMaxShiftDistance() {
+    return maxShiftDistance;
+  }
+
+  /**
+   * @param maxShiftDistance
+   *          the new max shift distance
+   */
+  public void setMaxShiftDistance(int maxShiftDistance) {
+    this.maxShiftDistance = maxShiftDistance;
+  }
+
+  /**
+   * Two targets are equal if they have the same type and the same slot names in the same order.
+   * Learner and max shift distance are not part of the comparison.
+   * 
+   * @param o
+   *          the object to compare with; null and objects of other classes are never equal
+   * @return true if o is an equal target
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    // instanceof also rejects null, so neither a ClassCastException nor an NPE can occur
+    if (!(o instanceof TextRulerTarget)) {
+      return false;
+    }
+    TextRulerTarget other = (TextRulerTarget) o;
+    return type == other.type && java.util.Arrays.equals(slotNames, other.slotNames);
+  }
+
+  /**
+   * Hash code over the slot names' contents and the type, consistent with
+   * {@link #equals(Object)}.
+   * 
+   * @return the hash code
+   */
+  @Override
+  public int hashCode() {
+    // the array's own hashCode() is identity based and would differ for equal targets
+    int result = java.util.Arrays.hashCode(slotNames);
+    return 31 * result + (type == null ? 0 : type.hashCode());
+  }
+}

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain