You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2011/08/12 12:32:54 UTC

svn commit: r1157037 [8/10] - in /uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler: ./ .settings/ META-INF/ icons/ schema/ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/uima/ src/main/jav...

Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,670 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerExampleDocument;
+import org.apache.uima.textmarker.textruler.core.TextRulerRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleList;
+import org.apache.uima.textmarker.textruler.core.TextRulerRulePattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerSlotPattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.textmarker.textruler.learner.whisk.token.WhiskRuleItem.MLWhiskOtherConstraint;
+
+public class Whisk extends TextRulerBasicLearner {
+
+  // Key under which setParameters(Map) looks up the window size. The spelling
+  // ("WINDOSIZE") is part of the public API and therefore kept as is.
+  public final static String WINDOSIZE_KEY = "windowSize";
+
+  // Key under which setParameters(Map) looks up the error threshold.
+  public final static String ERROR_THRESHOLD_KEY = "errorThreshold";
+
+  // Key under which setParameters(Map) looks up the POS tag root type name.
+  public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";
+
+  // Default for windowSize.
+  public final static int STANDARD_WINDOWSIZE = 5;
+
+  // Default for errorThreshold.
+  public final static float STANDARD_ERROR_THRESHOLD = 0.1f;
+
+  // Default for posTagRootTypeName.
+  public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";
+
+  // Rules accepted so far in the current run; created in doRun() and rendered by
+  // getResultString().
+  TextRulerRuleList ruleList;
+
+  // Positive examples covered by the rules in ruleList; doRun() only induces new rules
+  // for examples that are not in this set.
+  protected Set<TextRulerExample> coveredExamples;
+
+  // Maximum distance (in term numbers) between a candidate term and the slot for the term
+  // to be considered in extendRule().
+  protected int windowSize = STANDARD_WINDOWSIZE;
+
+  // Laplacian limit used in doRun() to accept a rule that still covers negatives and in
+  // extendRule() to decide whether the unextended rule is already acceptable.
+  protected double errorThreshold = STANDARD_ERROR_THRESHOLD;
+
+  // Name of the type looked up in the CAS type system to find POS tag annotations in
+  // extendRule(); null or empty disables the POS tag rule variants.
+  protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;
+
+  // Incremented in doRun() for every rule that is grown; only used in status messages.
+  int roundNumber = 0;
+
+  // Number of all positive examples, set in doRun(); only used in status messages.
+  int allExamplesCount = 0;
+
+  // Covering statistics of rules that were already tested, keyed by the rule string
+  // (see testRulesIfNotCached()).
+  private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();
+
+  /**
+   * Creates the learner. All arguments are passed on unchanged to the
+   * {@link TextRulerBasicLearner} constructor; this class adds no initialization of its own.
+   */
+  public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
+          Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+    super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, delegate);
+  }
+
+  /**
+   * Always returns {@code false}.
+   * NOTE(review): presumably this tells the base learner not to record the individual
+   * negative instances a rule covers while testing - confirm against TextRulerBasicLearner.
+   */
+  @Override
+  public boolean collectNegativeCoveredInstancesWhenTesting() {
+    return false;
+  }
+
+  /**
+   * Runs the covering loop: for every positive example of the first slot that is not yet
+   * covered by an accepted rule, a new rule is grown from that example. The rule is accepted
+   * if it covers no negatives or if its Laplacian does not exceed {@link #errorThreshold}.
+   * <p>
+   * All per-run state (rule list, covered examples, round counter, statistics cache) is reset
+   * at the start, and the statistics cache is released when the method returns, no matter
+   * whether the run finished or was aborted.
+   */
+  @Override
+  protected void doRun() {
+
+    // We don't use the same overall structure as the original WHISK, since we do not repeat
+    // the whole process for some new training documents at the user's request. Like the other
+    // algorithms, we learn from the whole training set, so we do not, for example, need to
+    // test the intermediate rule base on a newly "incoming" training document: all rules were
+    // already tested on all training documents!
+
+    // This version of WHISK is not tested for multi slot learning since the seminar
+    // announcements are not quite suitable for this task: they do not all contain all 4 slots
+    // and some of them occur more than once in one document! And the order of them is not
+    // always the same either! So this is only tested for the single slot case, even if it is
+    // built to be capable of multi slot examples!
+
+    // this is the inner loop of the WHISK pseudo-code:
+    // For each inst in Training
+    // for each tag
+
+    cachedTestedRuleStatistics.clear();
+    ruleList = new TextRulerRuleList();
+    coveredExamples = new HashSet<TextRulerExample>();
+    // reset like the rest of the per-run state, so round numbers in status messages start
+    // from 1 again if this learner is run more than once
+    roundNumber = 0;
+
+    try {
+      sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false);
+      // only single-slot-target for now
+      TextRulerTarget target = new TextRulerTarget(slotNames[0], this);
+      exampleDocuments.createExamplesForTarget(target);
+
+      TextRulerExampleDocument[] docs = exampleDocuments.getSortedDocumentsInCacheOptimizedOrder();
+
+      allExamplesCount = exampleDocuments.getAllPositiveExamples().size();
+
+      for (TextRulerExampleDocument inst : docs) {
+        List<TextRulerExample> tags = inst.getPositiveExamples();
+
+        // for each uncovered example -> induce a new rule:
+        for (TextRulerExample tag : tags) {
+          if (!coveredExamples.contains(tag)) {
+            roundNumber++;
+            WhiskRule newRule = growRule(inst, tag);
+            if (shouldAbort()) {
+              break;
+            }
+            // a failed induction (null) is skipped; the example simply stays uncovered
+            if (newRule != null
+                    && (newRule.getCoveringStatistics().getCoveredNegativesCount() == 0 || newRule
+                            .getLaplacian() <= errorThreshold)) {
+              ruleList.addRule(newRule);
+              coveredExamples.addAll(newRule.getCoveringStatistics().getCoveredPositiveExamples());
+              sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING,
+                      true);
+            }
+          }
+        }
+        if (shouldAbort()) {
+          return;
+        }
+      }
+      sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
+    } finally {
+      // the cache is only useful during a run; release it on abort and on failure as well
+      cachedTestedRuleStatistics.clear();
+    }
+  }
+
+  protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) {
+    sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING,
+            false);
+    WhiskRule theRule = new WhiskRule(this, example.getTarget(), example);
+    int numberOfSlotsInTag = example.getAnnotations().length;
+    for (int i = 0; i < numberOfSlotsInTag; i++)
+      theRule.getPatterns().add(new TextRulerSlotPattern());
+
+    List<WhiskRuleItem> allTerms = getAllTermsOfExample(example);
+
+    sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING,
+            false);
+    for (int i = 0; i < numberOfSlotsInTag; i++) {
+      theRule = anchor(theRule, doc, example, allTerms, i);
+      if (shouldAbort())
+        return null;
+    }
+
+    sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING,
+            false);
+    if (theRule != null) {
+      double oldLaplacian = theRule.getLaplacian();
+      int subRoundNumber = 0;
+      // repeat while we still make errors...
+      while (theRule.getCoveringStatistics().getCoveredNegativesCount() > 0) {
+        WhiskRule extendedRule = extendRule(theRule, doc, example, allTerms, subRoundNumber);
+        if (extendedRule == null) {
+          // this way we get the previous rule
+          // as the best rule...
+          break;
+        }
+        theRule = extendedRule;
+        TextRulerToolkit.log("----------------------------");
+        TextRulerToolkit.log("BEST EXTENSION IS: " + theRule.getRuleString());
+        TextRulerToolkit.log("Laplacian: " + theRule.getLaplacian() + "    ; "
+                + theRule.getCoveringStatistics());
+        subRoundNumber++;
+
+        double newLaplacian = theRule.getLaplacian();
+        if (newLaplacian >= oldLaplacian) {
+          break;
+        }
+        oldLaplacian = newLaplacian;
+      }
+      TextRulerToolkit.log("----------------------------");
+      TextRulerToolkit.log("FINAL RULE IS : " + theRule.getRuleString());
+    }
+    return theRule;
+  }
+
+  /**
+   * Proposes extensions of {@code rule} by one additional term of the seed example, tests
+   * them on the training set and returns the one with the smallest Laplacian.
+   * <p>
+   * For every term of the example that is not yet part of the rule and lies within
+   * {@link #windowSize} terms before the first or after the last term of slot 0, these
+   * candidates are created:
+   * <ul>
+   * <li>the rule with the term added,</li>
+   * <li>if the term has a regexp word constraint: the same with the regexp hidden,</li>
+   * <li>if a POS tag annotation with exactly the token's bounds exists: both of the above
+   * with an additional POS tag constraint, plus one with only the POS tag constraint.</li>
+   * </ul>
+   * Terms for which no extended rule can be built are skipped.
+   * 
+   * @param rule
+   *          the rule to extend; it is not modified
+   * @param doc
+   *          the document of the seed example
+   * @param example
+   *          the seed example
+   * @param allTerms
+   *          all terms of the seed example
+   * @param subRoundNumber
+   *          only used for the status message
+   * @return the best rule found; this is {@code rule} itself if it already meets the error
+   *         threshold and no candidate has a smaller Laplacian; {@code null} if nothing
+   *         qualified or the run was aborted
+   */
+  protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
+          TextRulerExample example, List<WhiskRuleItem> allTerms, int subRoundNumber) {
+    WhiskRule bestRule = null;
+    double bestL = 1.0;
+    int bestRuleConstraintPoints = -1;
+    if (rule.getLaplacian() <= errorThreshold) {
+      bestRule = rule;
+      bestL = rule.getLaplacian();
+    }
+
+    List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms,
+            example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd());
+    if (slotTerms.isEmpty()) {
+      // without slot terms there is no window to pick candidate terms from
+      return bestRule;
+    }
+    WhiskRuleItem firstSlotTerm = slotTerms.get(0);
+    WhiskRuleItem lastSlotTerm = slotTerms.get(slotTerms.size() - 1);
+
+    List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();
+    for (WhiskRuleItem term : allTerms) {
+      if (rule.containsTerm(term)) {
+        continue;
+      }
+
+      int termNumber = term.getTermNumberInExample();
+      boolean rejectTerm = false;
+      // for now this works only for slot 0 (no multislot stuff here yet!)
+      if (termNumber < firstSlotTerm.getTermNumberInExample()) {
+        rejectTerm = firstSlotTerm.getTermNumberInExample() - termNumber > windowSize;
+      } else if (termNumber > lastSlotTerm.getTermNumberInExample()) {
+        // terms behind the slot are measured from the END of the slot, so that the window
+        // behind the slot does not shrink with the length of the slot
+        rejectTerm = termNumber - lastSlotTerm.getTermNumberInExample() > windowSize;
+      }
+
+      if (rejectTerm) {
+        // out of window scope -> skip to next...
+        continue;
+      }
+
+      WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
+      if (proposedRule == null) {
+        // the term could not be added (conflict, or the rule did not change) -> next term
+        continue;
+      }
+      addCandidate(rulesToTest, proposedRule);
+
+      WhiskRuleItem t = proposedRule.searchItemWithTermNumber(termNumber);
+      if (t == null) {
+        // the new term cannot be located in the rule, so no variants can be derived from it
+        continue;
+      }
+
+      // add a second version where we remove the exact token content if
+      // it is a regexp item:
+      WhiskRule proposedRule2 = null;
+      if (t.getWordConstraint().isRegExpConstraint()) {
+        proposedRule2 = proposedRule.copy();
+        WhiskRuleItem t2 = proposedRule2.searchItemWithTermNumber(termNumber);
+        t2.setHideRegExp(true);
+        proposedRule2.setNeedsCompile(true);
+        addCandidate(rulesToTest, proposedRule2);
+      }
+
+      // and now, for WHISK performance testing purposes, we also add POS tags:
+      // this is not very nice code and not dynamic feature capable, but it allows testing
+      // WHISK with PosTag terms...
+      if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
+        TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
+        CAS cas = example.getDocumentCAS();
+        TypeSystem ts = cas.getTypeSystem();
+        Type posTagsRootType = ts.getType(posTagRootTypeName);
+        // the type lookup yields null if the type system does not know the configured name;
+        // no POS tag variants can be built then
+        if (posTagsRootType != null) {
+          // POS-Tags created by our test hmm tagger.
+          List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
+                  tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
+          if (posTagAnnotations.size() > 0) {
+            AnnotationFS posTag = posTagAnnotations.get(0);
+            if (posTag.getBegin() == tokenAnnotation.getBegin()
+                    && posTag.getEnd() == tokenAnnotation.getEnd()) {
+              TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc);
+
+              // 1. most specific term with all constraints we have:
+              WhiskRule proposedRule3 = proposedRule.copy();
+              WhiskRuleItem t3 = proposedRule3.searchItemWithTermNumber(termNumber);
+              t3.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
+              proposedRule3.setNeedsCompile(true);
+              addCandidate(rulesToTest, proposedRule3);
+
+              // 2. the same without the regexp thingy:
+              if (proposedRule2 != null) {
+                WhiskRule proposedRule4 = proposedRule2.copy();
+                WhiskRuleItem t4 = proposedRule4.searchItemWithTermNumber(termNumber);
+                t4.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
+                proposedRule4.setNeedsCompile(true);
+                addCandidate(rulesToTest, proposedRule4);
+              }
+
+              // 3. last but not least: a rule with only the pos tag constraint:
+              WhiskRule proposedRule5 = proposedRule.copy();
+              WhiskRuleItem t5 = proposedRule5.searchItemWithTermNumber(termNumber);
+              t5.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
+              t5.setWordConstraint(null);
+              proposedRule5.setNeedsCompile(true);
+              addCandidate(rulesToTest, proposedRule5);
+            }
+          }
+        }
+      }
+
+    }
+    if (rulesToTest.size() == 0) {
+      return bestRule;
+    }
+
+    sendStatusUpdateToDelegate(
+            "Round "
+                    + roundNumber
+                    + "."
+                    + subRoundNumber
+                    + " - Testing "
+                    + rulesToTest.size()
+                    + " rules... "
+                    + " - uncovered examples: "
+                    + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
+                            + " ; cs=" + cachedTestedRuleStatistics.size()),
+            TextRulerLearnerState.ML_RUNNING, false);
+
+    TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
+    for (TextRulerRule r : rulesToTest) {
+      TextRulerToolkit.log(r.getRuleString());
+    }
+    testRulesIfNotCached(rulesToTest);
+    if (shouldAbort()) {
+      return null;
+    }
+    for (TextRulerRule r : rulesToTest) {
+      WhiskRule wr = (WhiskRule) r;
+      if (wr.getLaplacian() < bestL) {
+        bestL = wr.getLaplacian();
+        bestRule = wr;
+        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
+      } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
+        // tie between two candidates: the one with fewer constraint points wins
+        TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
+        if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
+          TextRulerToolkit.log("\tYes, prefered!");
+          bestL = wr.getLaplacian();
+          bestRule = wr;
+          bestRuleConstraintPoints = bestRule.totalConstraintPoints();
+        }
+      }
+    }
+    return bestRule;
+  }
+
+  /** Appends {@code candidate} to {@code candidates} unless an equal rule is already in it. */
+  private static void addCandidate(List<TextRulerRule> candidates, WhiskRule candidate) {
+    if (!candidates.contains(candidate)) {
+      candidates.add(candidate);
+    }
+  }
+
+  /**
+   * Builds a new rule from a copy of {@code baseRule} with a copy of {@code term} added.
+   * <p>
+   * The pattern (prefiller, filler or postfiller of a slot) that receives the term is chosen
+   * by comparing the term number with the term numbers of the first and last items of the
+   * existing patterns. Within that pattern, the term either replaces a star wildcard that
+   * carries the same term number or is inserted in term number order. Afterwards a wildcard
+   * is inserted between the new term and a neighbor if the neighbor is neither the directly
+   * adjacent term nor a wildcard itself.
+   * 
+   * @param baseRule
+   *          the rule to start from; all changes are applied to the result of
+   *          {@code baseRule.copy()}
+   * @param term
+   *          the term to add
+   * @return the new rule, or {@code null} if an item with the same term number exists that
+   *         is not a star wildcard or is not part of the chosen pattern, or if the rule
+   *         string of the result equals that of {@code baseRule}
+   */
+  protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
+    WhiskRule newRule = baseRule.copy();
+    // foundSlotNumber and foundSlotPattern are debug info: they are only read by the
+    // commented-out log statements further down
+    int foundSlotNumber = -1;
+    String foundSlotPattern = "";
+    int termNumber = term.getTermNumberInExample();
+    // determine where this term is located relative to the slots we have...
+    TextRulerRulePattern targetPattern = null;
+    TextRulerRulePattern previousSlotPostFillerPattern = null;
+    for (int i = 0; i < newRule.getPatterns().size(); i++) {
+      TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i);
+      // look at the prefiller pattern
+      WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem();
+      if (it != null && termNumber <= it.getTermNumberInExample())
+        targetPattern = slotPattern.preFillerPattern;
+      // now look at the filler pattern
+      if (targetPattern == null && slotPattern.fillerPattern.size() > 0)
+      {
+        it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem();
+        // the term lies before the filler, so it is still for the prefiller pattern; that
+        // pattern did not match above (e.g. because it is empty)
+        if (termNumber < it.getTermNumberInExample())
+          targetPattern = slotPattern.preFillerPattern;
+        else {
+          it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem();
+          if (termNumber <= it.getTermNumberInExample()) {
+            targetPattern = slotPattern.fillerPattern;
+          }
+        }
+      }
+      // now look at the postfiller pattern
+      if (targetPattern == null && slotPattern.postFillerPattern.size() > 0)
+      {
+        it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem();
+        // the term lies before the postfiller, so it is still for the filler pattern; that
+        // pattern did not match above (e.g. because it is empty)
+        if (termNumber < it.getTermNumberInExample())
+          targetPattern = slotPattern.fillerPattern;
+        else {
+          it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem();
+          if (termNumber <= it.getTermNumberInExample())
+            targetPattern = slotPattern.postFillerPattern;
+        }
+      }
+      if (targetPattern == null) {
+        // nothing found in this slot: fall back to the postfiller pattern of the previous
+        // slot (still null while looking at the first slot)
+        targetPattern = previousSlotPostFillerPattern;
+        if (i > 0) {
+          TextRulerSlotPattern prevSlotPattern = newRule.getPatterns().get(i - 1);
+          foundSlotPattern = targetPattern == prevSlotPattern.preFillerPattern ? "PRE FILLER"
+                  : (targetPattern == prevSlotPattern.fillerPattern ? "FILLER" : "POST FILLER");
+          foundSlotNumber = i - 1;
+        }
+      } else {
+        foundSlotPattern = targetPattern == slotPattern.preFillerPattern ? "PRE FILLER"
+                : (targetPattern == slotPattern.fillerPattern ? "FILLER" : "POST FILLER");
+        foundSlotNumber = i;
+      }
+      previousSlotPostFillerPattern = slotPattern.postFillerPattern;
+    }
+
+    if (targetPattern == null) {
+      // the term was not placed in any slot: use the postfiller pattern of the last slot
+      targetPattern = previousSlotPostFillerPattern;
+      foundSlotNumber = newRule.getPatterns().size() - 1;
+      foundSlotPattern = "POST FILLER";
+    }
+
+    if (targetPattern == null) {
+      // only possible if the rule has no slot patterns at all; newRule stays unchanged, so
+      // the rule string comparison at the end makes this method return null
+      TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
+    } else {
+      // TextRulerToolkit.log("Ok, found for Rule: "+newRule.getRuleString());
+      // TextRulerToolkit.log("Term: "+term.getTermNumberInExample()+" ; "+term);
+      // TextRulerToolkit.log("Slot "+foundSlotNumber+" - Pattern: "+foundSlotPattern);
+      // now put that term into the rule:
+      int indexInPattern = -1;
+      if (targetPattern.size() == 0) {
+        targetPattern.add(term.copy());
+        indexInPattern = 0;
+      } else {
+        // 1. search if the term would replace a wildcard:
+        WhiskRuleItem wildCard = newRule.searchItemWithTermNumber(termNumber);
+        if (wildCard != null) {
+          if (!wildCard.isStarWildCard()) {
+            TextRulerToolkit
+                    .log("ERROR, FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW IS THAT???");
+            return null;
+          }
+          if (!targetPattern.contains(wildCard)) {
+            TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
+            return null;
+          }
+          indexInPattern = targetPattern.indexOf(wildCard);
+          targetPattern.set(indexInPattern, term.copy());
+        } else {
+          // not a wildcard, so search for the insertion point: the first item with a
+          // larger term number
+          for (int i = 0; i < targetPattern.size(); i++) {
+            WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i);
+            if (termNumber < it.getTermNumberInExample()) {
+              indexInPattern = i;
+              break;
+            }
+          }
+          if (indexInPattern < 0) {
+            // no item with a larger term number: append at the end
+            indexInPattern = targetPattern.size();
+            targetPattern.add(term.copy());
+          } else
+            targetPattern.add(indexInPattern, term.copy());
+        }
+      }
+      // ok, now we have replaced a wildcard with the term or added the term between two
+      // other items. We now have to check the neighbors of the new term: if a neighbor is a
+      // direct neighbor (according to the termNumber), we have nothing special to do. But if
+      // it is not a direct neighbor, we have to add a wildcard between the two items (unless
+      // the neighbor item is a wildcard itself)!
+      WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);
+
+      // look at left neighbor:
+      WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true);
+      if (left != null) {
+        // TextRulerToolkit.log("LEFT NEIGHBOR FOUND!");
+
+        // so we have a left neighbor. let's see if it also is the
+        // neighbor in our seed token stream:
+        if (left.getTermNumberInExample() < newTerm.getTermNumberInExample() - 1
+                && !left.isStarWildCard()) {
+          // no direct neighbor and no wildcard yet, so insert a wildcard between us! The
+          // new term moves one position to the right by that.
+          targetPattern.add(indexInPattern,
+                  WhiskRuleItem.newWildCardItem(left.getTermNumberInExample() + 1));
+          indexInPattern++;
+        }
+      }
+
+      // look at right neighbor:
+      WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false);
+      if (right != null) {
+        // TextRulerToolkit.log("RIGHT NEIGHBOR FOUND!");
+        // so we have a right neighbor. let's see if it also is the
+        // neighbor in our seed token stream:
+        if (right.getTermNumberInExample() > newTerm.getTermNumberInExample() + 1
+                && !right.isStarWildCard()) {
+          // no direct neighbor and no wildcard yet, so insert a wildcard between us!
+          WhiskRuleItem wc = WhiskRuleItem.newWildCardItem(newTerm.getTermNumberInExample() + 1);
+          if (indexInPattern + 1 < targetPattern.size())
+            targetPattern.add(indexInPattern + 1, wc);
+          else
+            targetPattern.add(wc);
+        }
+      }
+
+      newRule.setNeedsCompile(true);
+      // TextRulerToolkit.log("BEFORE: "+baseRule.getRuleString());
+      // TextRulerToolkit.log("AFTER : "+newRule.getRuleString());
+      // TextRulerToolkit.log("");
+    }
+    // a "new" rule that reads exactly like the base rule must not be!
+    if (newRule.getRuleString().equals(baseRule.getRuleString()))
+      return null;
+    else
+      return newRule;
+  }
+
+  protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
+          TextRulerExample example, List<WhiskRuleItem> allTerms, int slotIndex) {
+    TextRulerAnnotation slotAnnotation = example.getAnnotations()[slotIndex];
+    List<WhiskRuleItem> inside = getTermsWithinBounds(allTerms, slotAnnotation.getBegin(),
+            slotAnnotation.getEnd());
+
+    if (rule == null || inside.isEmpty()) {
+      return null;
+    }
+    // create base 1 and base 2:
+    WhiskRule base1 = rule.copy(); // slot filler rule
+    TextRulerSlotPattern slotPattern = base1.getPatterns().get(slotIndex);
+    for (int i = 0; i < inside.size(); i++)
+      if (i == 0 || (i == inside.size() - 1))
+        slotPattern.fillerPattern.add(inside.get(i).copy());
+      else if (inside.size() > 2 && i < 2)
+        slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(i)
+                .getTermNumberInExample()));
+
+    WhiskRule base2 = rule.copy(); // slot context rule
+    slotPattern = base2.getPatterns().get(slotIndex);
+
+    int firstOfSlot = allTerms.indexOf(inside.get(0));
+    int lastOfSlot = allTerms.indexOf(inside.get(inside.size() - 1));
+    if (firstOfSlot > 0)
+      slotPattern.preFillerPattern.add(allTerms.get(firstOfSlot - 1));
+    slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(0)
+            .getTermNumberInExample()));
+    if (lastOfSlot + 1 < allTerms.size())
+      slotPattern.postFillerPattern.add(allTerms.get(lastOfSlot + 1));
+
+    TextRulerToolkit.log("base1: " + base1.getRuleString());
+    TextRulerToolkit.log("base2: " + base2.getRuleString());
+    List<TextRulerRule> testRules = new ArrayList<TextRulerRule>();
+    testRules.add(base1);
+    testRules.add(base2);
+    // testRulesOnDocumentSet(testRules, exampleDocuments);
+    testRulesIfNotCached(testRules);
+    if (shouldAbort())
+      return null;
+    TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
+            + base1.getLaplacian());
+    TextRulerToolkit.log("\tbase2: " + base2.getCoveringStatistics() + " --> laplacian = "
+            + base2.getLaplacian());
+    if (base2.getCoveringStatistics().getCoveredPositivesCount() > base1.getCoveringStatistics()
+            .getCoveredPositivesCount())
+      return base2;
+    else
+      return base1;
+  }
+
+  public String getResultString() {
+    if (ruleList != null)
+      return getTMFileHeaderString() + ruleList.getRulesString("");
+    else
+      return "No results available yet!";
+  }
+
+  public void setParameters(Map<String, Object> params) {
+    if (TextRulerToolkit.DEBUG)
+      saveParametersToTempFolder(params);
+
+    // TODO try catch
+    if (params.containsKey(WINDOSIZE_KEY))
+      windowSize = (Integer) params.get(WINDOSIZE_KEY);
+
+    if (params.containsKey(ERROR_THRESHOLD_KEY))
+      errorThreshold = (Float) params.get(ERROR_THRESHOLD_KEY);
+
+    if (params.containsKey(POSTAG_ROOTTYPE_KEY))
+      posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY);
+
+  }
+
+  /**
+   * Creates one rule item per annotation of the basic token type
+   * ({@code TextRulerToolkit.TM_ANY_TYPE_NAME}) in the document of the example. The
+   * annotations are fetched through {@code TextRulerToolkit.getAnnotationsWithinBounds}
+   * for the whole document text, using the filter set extended by the slot names.
+   * 
+   * @param example
+   *          the example whose document is tokenized
+   * @return the terms in the order they were delivered, numbered consecutively from 0
+   */
+  public List<WhiskRuleItem> getAllTermsOfExample(TextRulerExample example) {
+    CAS cas = example.getDocumentCAS();
+    Type tokensRootType = cas.getTypeSystem().getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+    int endBound = cas.getDocumentText().length() + 1;
+    List<AnnotationFS> tokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, 0, endBound,
+            TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
+
+    List<WhiskRuleItem> terms = new ArrayList<WhiskRuleItem>(tokens.size());
+    for (AnnotationFS token : tokens) {
+      WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(token, example.getDocument()));
+      // the number of terms collected so far is the number of the next term
+      term.setTermNumberInExample(terms.size());
+      terms.add(term);
+    }
+    return terms;
+  }
+
+  /**
+   * Selects the terms whose token annotation lies completely within
+   * [{@code startPos}, {@code endPos}]. The scan stops at the first term that ends behind
+   * {@code endPos}, so terms following such a term are never returned.
+   * 
+   * @param allTerms
+   *          the terms to select from
+   * @param startPos
+   *          smallest accepted begin offset
+   * @param endPos
+   *          largest accepted end offset
+   * @return the selected terms in the order of {@code allTerms}; possibly empty
+   */
+  public List<WhiskRuleItem> getTermsWithinBounds(List<WhiskRuleItem> allTerms, int startPos,
+          int endPos) {
+    List<WhiskRuleItem> selected = new ArrayList<WhiskRuleItem>();
+    for (WhiskRuleItem candidate : allTerms) {
+      TextRulerAnnotation token = candidate.getWordConstraint().getTokenAnnotation();
+      if (token.getEnd() > endPos) {
+        // past the end of the range: this term does not qualify and ends the scan
+        break;
+      }
+      if (token.getBegin() >= startPos) {
+        selected.add(candidate);
+      }
+    }
+    return selected;
+  }
+
+  // TODO share this between algorithms (e.g. LP2 and RAPIER, WHISK ?) and
+  // make a maximum size of the cache, etc. like CasCache?
+  /**
+   * Makes sure every given rule has covering statistics. A rule whose rule string is already
+   * in the cache gets a copy of the cached statistics; all other rules are tested on the
+   * example documents and a copy of their statistics is put into the cache. If the run is
+   * aborted during testing, nothing is added to the cache.
+   * 
+   * @param rules
+   *          the rules that need covering statistics
+   */
+  protected void testRulesIfNotCached(List<TextRulerRule> rules) {
+    List<TextRulerRule> uncached = new ArrayList<TextRulerRule>();
+
+    for (TextRulerRule rule : rules) {
+      String ruleString = rule.getRuleString();
+      if (!cachedTestedRuleStatistics.containsKey(ruleString)) {
+        uncached.add(rule);
+        continue;
+      }
+      rule.setCoveringStatistics(cachedTestedRuleStatistics.get(ruleString).copy());
+      TextRulerToolkit.log("CACHE HIT !");
+    }
+
+    if (uncached.isEmpty()) {
+      return;
+    }
+
+    testRulesOnDocumentSet(uncached, exampleDocuments);
+    if (shouldAbort()) {
+      return;
+    }
+    for (TextRulerRule tested : uncached) {
+      cachedTestedRuleStatistics.put(tested.getRuleString(), tested.getCoveringStatistics().copy());
+    }
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearner;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter.MLAlgorithmParamType;
+
+/**
+ * Factory that creates {@link Whisk} learner instances and describes the parameters the learner
+ * exposes (window size, error threshold and PosTag root type).
+ */
+public class WhiskFactory implements TextRulerLearnerFactory {
+
+  /**
+   * Creates a new {@link Whisk} learner.
+   * 
+   * Note that {@code additionalFolderPath} is accepted but not handed on to the learner.
+   */
+  public TextRulerLearner createAlgorithm(String inputFolderPath, String additionalFolderPath,
+          String preprocessorTMfile, String tempFolderPath, String[] fullSlotTypeNames,
+          Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+    Whisk learner = new Whisk(inputFolderPath, preprocessorTMfile, tempFolderPath,
+            fullSlotTypeNames, filterSet, delegate);
+    return learner;
+  }
+
+  /**
+   * @return a fresh map from parameter key to the standard value defined by {@link Whisk}
+   */
+  public Map<String, Object> getAlgorithmParameterStandardValues() {
+    Map<String, Object> defaults = new HashMap<String, Object>();
+    defaults.put(Whisk.WINDOSIZE_KEY, Whisk.STANDARD_WINDOWSIZE);
+    defaults.put(Whisk.ERROR_THRESHOLD_KEY, Whisk.STANDARD_ERROR_THRESHOLD);
+    defaults.put(Whisk.POSTAG_ROOTTYPE_KEY, Whisk.STANDARD_POSTAG_ROOTTYPE);
+    return defaults;
+  }
+
+  /**
+   * @return the three parameter descriptors (key, display name, value type) of the learner
+   */
+  public TextRulerLearnerParameter[] getAlgorithmParameters() {
+    TextRulerLearnerParameter windowSize = new TextRulerLearnerParameter(Whisk.WINDOSIZE_KEY,
+            "Window Size", MLAlgorithmParamType.ML_INT_PARAM);
+    TextRulerLearnerParameter errorThreshold = new TextRulerLearnerParameter(
+            Whisk.ERROR_THRESHOLD_KEY, "Maximum Error Threshold",
+            MLAlgorithmParamType.ML_FLOAT_PARAM);
+    TextRulerLearnerParameter posTagRootType = new TextRulerLearnerParameter(
+            Whisk.POSTAG_ROOTTYPE_KEY, "PosTag Root Type", MLAlgorithmParamType.ML_STRING_PARAM);
+
+    return new TextRulerLearnerParameter[] { windowSize, errorThreshold, posTagRootType };
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerMultiSlotRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerRulePattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerSlotPattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+
+/**
+ * Multi-slot rule of the token based WHISK learner. On top of the slot patterns inherited from
+ * {@link TextRulerMultiSlotRule} it stores the seed example handed in at construction and offers
+ * lookup helpers over the {@link WhiskRuleItem}s of its pre-filler, filler and post-filler
+ * patterns.
+ */
+public class WhiskRule extends TextRulerMultiSlotRule {
+
+  TextRulerExample seedExample;
+
+  /**
+   * Copy constructor; the seed example reference is shared with {@code copyFrom}.
+   */
+  public WhiskRule(WhiskRule copyFrom) {
+    super(copyFrom);
+    seedExample = copyFrom.seedExample;
+  }
+
+  public WhiskRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target,
+          TextRulerExample seedExample) {
+    super(parentAlgorithm, target);
+    this.seedExample = seedExample;
+  }
+
+  @Override
+  public WhiskRule copy() {
+    return new WhiskRule(this);
+  }
+
+  /**
+   * Computes {@code (e + 1) / (n + 1)}, where {@code e} is the number of covered negative
+   * examples and {@code n} the number of all covered examples (negatives plus positives). Without
+   * covering statistics both counts are treated as zero, which yields {@code 1.0}.
+   * 
+   * @return the Laplacian value of this rule
+   */
+  public double getLaplacian() {
+    int e = 0;
+    int n = 0;
+
+    if (coveringStatistics != null) {
+      e = coveringStatistics.getCoveredNegativesCount();
+      n = coveringStatistics.getCoveredNegativesCount()
+              + coveringStatistics.getCoveredPositivesCount();
+    }
+    return ((double) e + 1) / ((double) n + 1);
+  }
+
+  public TextRulerExample getSeedExample() {
+    return seedExample;
+  }
+
+  /**
+   * Stores the statistics and, in debug mode, logs an error if the covered positive examples do
+   * not contain the seed example of this rule.
+   */
+  @Override
+  public void setCoveringStatistics(TextRulerStatisticsCollector c) {
+    super.setCoveringStatistics(c);
+    if (TextRulerToolkit.DEBUG && c != null) {
+      if (!c.getCoveredPositiveExamples().contains(seedExample)) {
+        TextRulerToolkit.log("ERROR, A WHISK RULE MUST COVER AT LEAST ITS SEED EXAMPLE!");
+        TextRulerToolkit.log("\tRULE: " + getRuleString());
+      }
+    }
+  }
+
+  /**
+   * @return true if any item of any slot pattern reports itself equal to {@code term}
+   */
+  public boolean containsTerm(WhiskRuleItem term) {
+    for (TextRulerSlotPattern sp : slotPatterns) {
+      for (TextRulerRuleItem i : sp.preFillerPattern) {
+        if (i.equals(term)) {
+          return true;
+        }
+      }
+      for (TextRulerRuleItem i : sp.fillerPattern) {
+        if (i.equals(term)) {
+          return true;
+        }
+      }
+      for (TextRulerRuleItem i : sp.postFillerPattern) {
+        if (i.equals(term)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /**
+   * @return the first item (in pre-filler, filler, post-filler order per slot) whose term number
+   *         equals {@code no}, or null if there is none
+   */
+  public WhiskRuleItem searchItemWithTermNumber(int no) {
+    for (TextRulerSlotPattern sp : slotPatterns) {
+      for (TextRulerRuleItem i : sp.preFillerPattern) {
+        if (((WhiskRuleItem) i).getTermNumberInExample() == no) {
+          return (WhiskRuleItem) i;
+        }
+      }
+      for (TextRulerRuleItem i : sp.fillerPattern) {
+        if (((WhiskRuleItem) i).getTermNumberInExample() == no) {
+          return (WhiskRuleItem) i;
+        }
+      }
+      for (TextRulerRuleItem i : sp.postFillerPattern) {
+        if (((WhiskRuleItem) i).getTermNumberInExample() == no) {
+          return (WhiskRuleItem) i;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Finds the item directly next to {@code item} in the flattened sequence of all patterns
+   * (slot by slot, pre-filler, filler, post-filler), skipping empty patterns.
+   * 
+   * @param item
+   *          the item to start from; it is located by identity, not by equals
+   * @param goToLeft
+   *          true to search for the preceding item, false for the following one
+   * @return the neighboring item, or null if {@code item} is not part of this rule or has no
+   *         neighbor in the requested direction
+   */
+  // TODO this could be moved to the core framework (TextRulerMultiSlotRule)
+  public WhiskRuleItem searchNeighborOfItem(WhiskRuleItem item, boolean goToLeft) {
+    int slotIndex = -1;
+    int patternIndex = -1;
+    int slotI = 0;
+    for (TextRulerSlotPattern sp : slotPatterns) {
+      if (containsSameInstance(sp.preFillerPattern, item)) {
+        patternIndex = 0; // 0=preFiller
+      } else if (containsSameInstance(sp.fillerPattern, item)) {
+        patternIndex = 1; // 1=filler
+      } else if (containsSameInstance(sp.postFillerPattern, item)) {
+        patternIndex = 2; // 2=postFiller
+      }
+      if (patternIndex >= 0) {
+        slotIndex = slotI;
+        break;
+      }
+      // advance the slot counter; without this every hit would be reported for slot 0 and the
+      // neighbor walk below would start in the wrong slot
+      slotI++;
+    }
+    if (slotIndex < 0) {
+      // we didn't even find the item in our rule ?! how can this happen ?
+      return null;
+    }
+
+    TextRulerRulePattern currentPattern = getPattern(slotIndex, patternIndex);
+    while (currentPattern != null) {
+      // this is only >= 0 for the first pattern...
+      int startIndex = currentPattern.indexOf(item);
+      if (!goToLeft) {
+        // walk forward...
+        int startSearchFromIndex = startIndex + 1;
+        if (startSearchFromIndex < currentPattern.size()) {
+          return (WhiskRuleItem) currentPattern.get(startSearchFromIndex);
+        }
+        // skip to next pattern
+        patternIndex++;
+        if (patternIndex > 2) {
+          patternIndex = 0;
+          slotIndex++;
+          if (slotIndex >= slotPatterns.size()) {
+            return null; // not found!
+          }
+        }
+        currentPattern = getPattern(slotIndex, patternIndex);
+      } else {
+        // in follow-up patterns (startIndex < 0) start from the last element
+        int startSearchFromIndex = startIndex >= 0 ? startIndex - 1 : currentPattern.size() - 1;
+        if (startSearchFromIndex >= 0 && currentPattern.size() > 0) {
+          return (WhiskRuleItem) currentPattern.get(startSearchFromIndex);
+        }
+        // skip to previous pattern
+        patternIndex--;
+        if (patternIndex < 0) {
+          patternIndex = 2;
+          slotIndex--;
+          if (slotIndex < 0) {
+            return null; // not found!
+          }
+        }
+        currentPattern = getPattern(slotIndex, patternIndex);
+      }
+    }
+    return null;
+  }
+
+  /**
+   * @return true if {@code pattern} holds exactly the given instance (reference comparison)
+   */
+  private static boolean containsSameInstance(TextRulerRulePattern pattern, TextRulerRuleItem item) {
+    for (TextRulerRuleItem it : pattern) {
+      if (it == item) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * @return the pre-filler (0), filler (1) or post-filler (2) pattern of the given slot, or null
+   *         for any other pattern index
+   */
+  private TextRulerRulePattern getPattern(int slotIndex, int patternIndex) {
+    TextRulerSlotPattern sp = slotPatterns.get(slotIndex);
+    if (patternIndex == 0) {
+      return sp.preFillerPattern;
+    } else if (patternIndex == 1) {
+      return sp.fillerPattern;
+    } else if (patternIndex == 2) {
+      return sp.postFillerPattern;
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * @return the sum of {@link WhiskRuleItem#constraintPoints()} over all items of all patterns
+   */
+  public int totalConstraintPoints() {
+    int result = 0;
+    for (TextRulerSlotPattern sl : slotPatterns) {
+      for (TextRulerRuleItem i : sl.preFillerPattern) {
+        result += ((WhiskRuleItem) i).constraintPoints();
+      }
+      for (TextRulerRuleItem i : sl.fillerPattern) {
+        result += ((WhiskRuleItem) i).constraintPoints();
+      }
+      for (TextRulerRuleItem i : sl.postFillerPattern) {
+        result += ((WhiskRuleItem) i).constraintPoints();
+      }
+    }
+    return result;
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.cas.Type;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerWordConstraint;
+
+/**
+ * A single item of a {@link WhiskRule} pattern. An item can carry a word constraint, a list of
+ * additional annotation constraints, a "star wildcard" flag and the number of the term in the
+ * example it was created from.
+ */
+public class WhiskRuleItem implements TextRulerRuleItem {
+
+  private TextRulerWordConstraint wordConstraint;
+
+  private boolean isStarWildCard = false;
+
+  private int termNumberInExample = -1;
+
+  private boolean hideRegExp = false;
+
+  protected List<MLWhiskOtherConstraint> otherConstraints = new ArrayList<MLWhiskOtherConstraint>();
+
+  /**
+   * Additional constraint of a rule item, defined by a token annotation and a constraint
+   * annotation. The constraint can serve as the anchor of the item if both annotations have the
+   * same begin and end.
+   */
+  public static class MLWhiskOtherConstraint {
+
+    TextRulerAnnotation tokenAnnotation;
+
+    TextRulerAnnotation constraintAnnotation;
+
+    boolean canBeAnchor;
+
+    Type type;
+
+    public MLWhiskOtherConstraint(TextRulerAnnotation tokenAnnotation,
+            TextRulerAnnotation constraintAnnotation) {
+      this.tokenAnnotation = tokenAnnotation;
+      this.constraintAnnotation = constraintAnnotation;
+      this.type = constraintAnnotation.getType();
+      canBeAnchor = (tokenAnnotation.getBegin() == constraintAnnotation.getBegin())
+              && (tokenAnnotation.getEnd() == constraintAnnotation.getEnd());
+      // TODO is the matching END also a requirement ?
+    }
+
+    public boolean isTMBasicTypeTokenConstraint() {
+      return tokenAnnotation == constraintAnnotation;
+    }
+
+    public boolean canBeAnchorConstraint() {
+      return canBeAnchor;
+    }
+
+    /**
+     * Two constraints are equal if they have the same string form (short type name) and the same
+     * anchor capability. Null and objects of other classes are never equal.
+     */
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) {
+        return true;
+      }
+      // guard the cast: equals must return false instead of throwing for null/foreign objects
+      if (!(o instanceof MLWhiskOtherConstraint)) {
+        return false;
+      }
+      MLWhiskOtherConstraint co = (MLWhiskOtherConstraint) o;
+
+      return toString().equals(co.toString()) && (canBeAnchor == co.canBeAnchor);
+    }
+
+    @Override
+    public int hashCode() {
+      return toString().hashCode() * (canBeAnchor ? 2 : 1);
+    }
+
+    @Override
+    public String toString() {
+      return type.getShortName();
+    }
+
+    public MLWhiskOtherConstraint copy() {
+      return new MLWhiskOtherConstraint(tokenAnnotation, constraintAnnotation);
+    }
+
+  }
+
+  public WhiskRuleItem() {
+    super();
+    wordConstraint = null;
+    termNumberInExample = -1;
+  }
+
+  /**
+   * @return a new star wildcard item without word constraint for the given term number
+   */
+  public static WhiskRuleItem newWildCardItem(int startTermNumber) {
+    WhiskRuleItem i = new WhiskRuleItem();
+    i.setIsStarWildCard(true);
+    i.setTermNumberInExample(startTermNumber);
+    return i;
+  }
+
+  /**
+   * Copy constructor; the word constraint and all other constraints are copied as well.
+   */
+  public WhiskRuleItem(WhiskRuleItem copyFrom) {
+    super();
+    if (copyFrom.wordConstraint != null) {
+      wordConstraint = copyFrom.wordConstraint.copy();
+    }
+    isStarWildCard = copyFrom.isStarWildCard;
+    termNumberInExample = copyFrom.termNumberInExample;
+    hideRegExp = copyFrom.hideRegExp;
+    for (MLWhiskOtherConstraint c : copyFrom.otherConstraints) {
+      otherConstraints.add(c.copy());
+    }
+  }
+
+  public WhiskRuleItem(TextRulerAnnotation tokenAnnotation) {
+    super();
+    setWordConstraint(new TextRulerWordConstraint(tokenAnnotation));
+  }
+
+  public void setWordConstraint(TextRulerWordConstraint c) {
+    wordConstraint = c;
+  }
+
+  public TextRulerWordConstraint getWordConstraint() {
+    return wordConstraint;
+  }
+
+  public TextRulerRuleItem copy() {
+    return new WhiskRuleItem(this);
+  }
+
+  /**
+   * Builds the rule string fragment of this item: an anchor, optionally followed by "*?" for
+   * star wildcards, followed by a "{...}" block with the conditions and, for the first item of a
+   * filler pattern, the MARKONCE action.
+   * 
+   * @param rule
+   *          the owning rule; only used to look up the mark name of a marking item
+   * @param type
+   *          the kind of pattern this item belongs to
+   * @param numberInPattern
+   *          index of this item in its pattern
+   * @param patternSize
+   *          number of items in the pattern
+   * @param numberInRule
+   *          index of this item in the whole rule
+   * @param ruleSize
+   *          number of items in the whole rule (not used here)
+   * @param slotIndex
+   *          index of the slot the pattern belongs to
+   * @return the string for this item
+   */
+  public String getStringForRuleString(TextRulerRule rule, MLRuleItemType type,
+          int numberInPattern, int patternSize, int numberInRule, int ruleSize, int slotIndex) {
+
+    WhiskRule whiskRule = (WhiskRule) rule;
+    // only the first item of a filler pattern carries the MARKONCE action
+    boolean isMarkingItem = type == MLRuleItemType.FILLER && numberInPattern == 0;
+    List<String> constraints = new ArrayList<String>();
+
+    String anchor = null;
+
+    if (wordConstraint != null) {
+      if (wordConstraint.isRegExpConstraint()) {
+        anchor = wordConstraint.typeShortName();
+        if (!hideRegExp) {
+          constraints.add("REGEXP(\"" + wordConstraint + "\")");
+        }
+      } else {
+        anchor = wordConstraint.toString();
+      }
+    }
+
+    // without a word based anchor, the first anchor-capable constraint becomes the anchor
+    MLWhiskOtherConstraint anchorConstraint = null;
+    if (anchor == null) {
+      for (MLWhiskOtherConstraint c : otherConstraints) {
+        if (c.canBeAnchorConstraint()) {
+          anchorConstraint = c;
+          break;
+        }
+      }
+    }
+
+    for (MLWhiskOtherConstraint oc : otherConstraints) {
+      if (oc != anchorConstraint) {
+        if (oc.canBeAnchorConstraint()) {
+          constraints.add("IS(" + oc + ")");
+        } else {
+          constraints.add("PARTOF(" + oc + ")");
+        }
+      }
+    }
+    if (anchor == null) {
+      if (anchorConstraint != null) {
+        anchor = anchorConstraint.toString();
+      } else {
+        anchor = "ALL";
+      }
+    }
+
+    StringBuilder result = new StringBuilder();
+    if (constraints.size() > 0) {
+      result.append("{");
+      boolean first = true;
+      for (String constraintStr : constraints) {
+        if (!first) {
+          result.append(", ");
+        }
+        result.append(constraintStr);
+        first = false;
+      }
+      // a marking item keeps the brace open so that the action can be appended below
+      if (!isMarkingItem) {
+        result.append("}");
+      }
+    }
+
+    if (isMarkingItem) {
+      if (constraints.size() == 0) {
+        result.append("{");
+      }
+      result.append("->MARKONCE(").append(whiskRule.getMarkName(slotIndex));
+      if (patternSize > 1) {
+        result.append(", ").append(numberInRule + 1).append(", ")
+                .append(numberInRule + patternSize);
+      }
+      result.append(")}");
+    }
+    if (isStarWildCard) {
+      anchor += "*?";
+    }
+    return anchor + result;
+  }
+
+  public void setIsStarWildCard(boolean flag) {
+    isStarWildCard = flag;
+  }
+
+  public boolean isStarWildCard() {
+    return isStarWildCard;
+  }
+
+  public void setTermNumberInExample(int i) {
+    termNumberInExample = i;
+  }
+
+  public int getTermNumberInExample() {
+    return termNumberInExample;
+  }
+
+  /**
+   * Compares this item with another rule item. Items of other classes (and null) are never
+   * equal. The word constraint is only compared if this item has one; wildcard flag and term
+   * number always have to match. The other constraints are not part of the comparison.
+   */
+  public boolean equals(TextRulerRuleItem o) {
+    // return false instead of failing with a ClassCastException/NullPointerException
+    if (!(o instanceof WhiskRuleItem)) {
+      return false;
+    }
+    WhiskRuleItem it = (WhiskRuleItem) o;
+    // NOTE(review): an item without word constraint equals an item with one, but not the other
+    // way round - confirm whether this asymmetry is intended before changing it
+    if (wordConstraint != null) {
+      if (!wordConstraint.equals(it.wordConstraint)) {
+        return false;
+      }
+    }
+
+    return isStarWildCard == it.isStarWildCard && termNumberInExample == it.termNumberInExample;
+  }
+
+  @Override
+  public String toString() {
+    return getStringForRuleString(null, null, 0, 0, 0, 0, 0);
+  }
+
+  public void setHideRegExp(boolean flag) {
+    hideRegExp = flag;
+  }
+
+  /**
+   * Adds the constraint unless an equal one is already present.
+   */
+  public void addOtherConstraint(MLWhiskOtherConstraint c) {
+    if (!otherConstraints.contains(c)) {
+      otherConstraints.add(c);
+    }
+  }
+
+  public List<MLWhiskOtherConstraint> getOtherConstraints() {
+    return otherConstraints;
+  }
+
+  /**
+   * @return 1 (regexp hidden) or 3 points for an existing word constraint plus one point per
+   *         other constraint
+   */
+  public int constraintPoints() {
+    int result = 0;
+    if (wordConstraint != null) {
+      // a regexp constraint is less general so point it bad here!
+      result += hideRegExp ? 1 : 3;
+    }
+    result += otherConstraints.size();
+    return result;
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.ArrayList;
+import java.util.Map;
+
+import org.apache.uima.textmarker.textruler.TextRulerPlugin;
+import org.apache.uima.textmarker.textruler.extension.TextRulerController;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerController;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter;
+import org.eclipse.jface.preference.BooleanFieldEditor;
+import org.eclipse.jface.preference.FieldEditor;
+import org.eclipse.jface.preference.IPreferenceStore;
+import org.eclipse.jface.preference.PreferencePage;
+import org.eclipse.jface.preference.StringFieldEditor;
+import org.eclipse.swt.SWT;
+import org.eclipse.swt.layout.GridData;
+import org.eclipse.swt.layout.GridLayout;
+import org.eclipse.swt.widgets.Composite;
+import org.eclipse.swt.widgets.Control;
+import org.eclipse.ui.IWorkbench;
+import org.eclipse.ui.IWorkbenchPreferencePage;
+
+/**
+ * Preference page of the token based WHISK learner. It creates one field editor for each
+ * boolean, float, int and string parameter reported by the learner factory and stores the
+ * values in the preference store of the TextRuler plugin.
+ */
+public class WhiskTokenPreferencePage extends PreferencePage implements IWorkbenchPreferencePage {
+
+  public static String ID = "org.apache.uima.textmarker.textruler.algorithmPages";
+
+  private TextRulerLearnerController algorithmController;
+
+  private IPreferenceStore store;
+
+  private ArrayList<FieldEditor> fields = new ArrayList<FieldEditor>();
+
+  public WhiskTokenPreferencePage() {
+    this.algorithmController = TextRulerController
+            .getControllerForID("org.apache.uima.textmarker.textruler.whisk.token");
+    store = TextRulerPlugin.getDefault().getPreferenceStore();
+    setPreferenceStore(store);
+  }
+
+  /**
+   * Builds the page content: a composite holding one editor per supported learner parameter.
+   * Each editor gets the standard value of its parameter as default and is loaded from the
+   * store. Select parameters get no editor.
+   */
+  @Override
+  protected Control createContents(Composite parent) {
+    Composite container = new Composite(parent, SWT.LEFT);
+    container.setLayoutData(new GridData(GridData.FILL_HORIZONTAL));
+    container.setLayout(new GridLayout());
+
+    TextRulerLearnerFactory factory = algorithmController.getFactory();
+    TextRulerLearnerParameter[] parameters = factory.getAlgorithmParameters();
+    Map<String, Object> standardValues = factory.getAlgorithmParameterStandardValues();
+    if (parameters == null) {
+      return container;
+    }
+
+    for (TextRulerLearnerParameter parameter : parameters) {
+      // preference key = learner id + "." + parameter id
+      String preferenceKey = algorithmController.getID() + "." + parameter.id;
+      FieldEditor editor = null;
+      switch (parameter.type) {
+        case ML_BOOL_PARAM:
+          editor = new BooleanFieldEditor(preferenceKey, parameter.name, container);
+          fields.add(editor);
+          store.setDefault(preferenceKey, (Boolean) standardValues.get(parameter.id));
+          break;
+
+        case ML_FLOAT_PARAM:
+        case ML_INT_PARAM:
+        case ML_STRING_PARAM:
+          // numeric parameters are edited as plain strings as well
+          editor = new StringFieldEditor(preferenceKey, parameter.name, container);
+          fields.add(editor);
+          store.setDefault(preferenceKey, standardValues.get(parameter.id).toString());
+          break;
+
+        case ML_SELECT_PARAM:
+          break;
+      }
+      if (editor != null) {
+        editor.setPreferenceStore(store);
+        editor.load();
+      }
+    }
+    return container;
+  }
+
+  @Override
+  public void init(IWorkbench workbench) {
+  }
+
+  /**
+   * Resets every editor of this page to its default value.
+   */
+  @Override
+  protected void performDefaults() {
+    for (FieldEditor editor : fields) {
+      editor.loadDefault();
+    }
+    // super.performDefaults();
+  }
+
+  /**
+   * Stores the value of every editor of this page.
+   * 
+   * @return always true
+   */
+  @Override
+  public boolean performOk() {
+    for (FieldEditor editor : fields) {
+      editor.store();
+    }
+    // return super.performOk();
+    return true;
+  }
+}

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,730 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.wien;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerExampleDocument;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerRulePattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerSlotPattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+
+public class Wien extends TextRulerBasicLearner {
+
+  TextRulerRulePattern hPattern;
+
+  TextRulerRulePattern tPattern;
+
+  Map<String, PatternPair> headTailCache = new HashMap<String, PatternPair>();
+
+  Map<String, List<TextRulerRulePattern>> interTupelSeparatorsCache = new HashMap<String, List<TextRulerRulePattern>>();
+
+  /**
+   * Simple mutable holder for two rule patterns. Within this learner it is used for the left/right
+   * delimiter of a slot and for the cached head/tail portion of a document.
+   */
+  public static class PatternPair {
+    // "left" pattern (left slot delimiter, or the page head when used as cache entry)
+    public TextRulerRulePattern l = new TextRulerRulePattern();
+
+    // "right" pattern (right slot delimiter, or the page tail when used as cache entry)
+    public TextRulerRulePattern r = new TextRulerRulePattern();
+  }
+
+  ArrayList<PatternPair> patternPairs = new ArrayList<PatternPair>();
+
+  WienRule theRule;
+
+  /**
+   * Outcome of checking a head / tail / l1 candidate triple with
+   * {@code testConstraint3}. Everything except {@link #C3_SUCCESS} names the check that failed.
+   */
+  public enum constraint3ReturnType {
+    /** all checks passed */
+    C3_SUCCESS,
+    /** l1 is not a suffix of the head portion that follows h */
+    C3_L1CandidateSuffixError,
+    /** t occurs between h and l1 in the page head */
+    C3_TailCandidateH_L1Error,
+    /** t starts with the right delimiter of the last slot */
+    C3_TailCandidateRK_PrefixError,
+    /** t does not occur in the page tail */
+    C3_TailCandidateNotFoundError,
+    /** l1 occurs in the page tail in front of t */
+    C3_TailCandidateSucceedsL1InTailError,
+    /** l1 is not a suffix of an inter-tuple separator */
+    C3_L1CandidateInterTupleSeparatorSuffixError,
+    /** t occurs in front of l1 inside an inter-tuple separator */
+    C3_TailCandidatePrecedesL1InterTupleSeparatorError
+  }
+
+  /**
+   * Creates the learner; all arguments are handed unchanged to the {@code TextRulerBasicLearner}
+   * constructor.
+   */
+  public Wien(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
+          Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+    super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, delegate);
+  }
+
+  /**
+   * @return always {@code false}
+   */
+  @Override
+  public boolean collectNegativeCoveredInstancesWhenTesting() {
+    return false;
+  }
+
+  /**
+   * Runs one learning pass: creates the multi-slot examples, searches the right delimiters, the
+   * left delimiters of slots 1..n-1 and finally the head, tail and first left delimiter. If all
+   * three searches succeed, the learned patterns are assembled into {@link #theRule}. Progress and
+   * the final state are reported through {@code sendStatusUpdateToDelegate}.
+   */
+  @Override
+  protected void doRun() {
+    TextRulerToolkit.log("-- WIEN START");
+
+    headTailCache.clear();
+    interTupelSeparatorsCache.clear();
+
+    // Start from a clean slate: without this, running the same instance again would keep
+    // appending pattern pairs and a failed run would still expose the rule of an earlier run.
+    patternPairs.clear();
+    theRule = null;
+    hPattern = null;
+    tPattern = null;
+    for (int i = 0; i < slotNames.length; i++) {
+      patternPairs.add(new PatternPair());
+    }
+
+    TextRulerTarget target = new TextRulerTarget(slotNames, this);
+
+    // create the (multi-slot) examples for this target
+    exampleDocuments.createExamplesForTarget(target);
+
+    for (TextRulerExample e : exampleDocuments.getAllPositiveExamples()) {
+      TextRulerToolkit.log("Example found: " + e);
+    }
+
+    try {
+      // all three searches are always executed, even if an earlier one failed
+      boolean allOk = true;
+      sendStatusUpdateToDelegate("Searching for right patterns...",
+              TextRulerLearnerState.ML_RUNNING, false);
+      if (!findRightPatterns()) {
+        allOk = false;
+      }
+      sendStatusUpdateToDelegate("Searching for left patterns...",
+              TextRulerLearnerState.ML_RUNNING, false);
+      if (!findLeftPatterns()) {
+        allOk = false;
+      }
+      sendStatusUpdateToDelegate("Searching for head, tail and left1 patterns...",
+              TextRulerLearnerState.ML_RUNNING, false);
+      if (!findHeadTailAndL1Patterns()) {
+        allOk = false;
+      }
+
+      if (allOk) {
+        sendStatusUpdateToDelegate("Building multi-slot rule.", TextRulerLearnerState.ML_RUNNING,
+                false);
+        theRule = new WienRule(this, target);
+        List<TextRulerSlotPattern> rPatterns = theRule.getPatterns();
+        // running count of rule items; needed for the index range of the final MARK action
+        int totalItemCount = 0;
+        for (int k = 0; k < slotNames.length; k++) {
+          WienRuleItem slotItem = new WienRuleItem((TextRulerAnnotation) null);
+          TextRulerSlotPattern rP = new TextRulerSlotPattern();
+          rPatterns.add(rP);
+          PatternPair p = patternPairs.get(k);
+          for (int i = 0; i < p.l.size(); i++) {
+            WienRuleItem item = (WienRuleItem) p.l.get(i);
+            if (k == 0 && i == 0) {
+              // the very first rule item gets the extra conditions (on a copy, so the learned
+              // pattern itself stays untouched)
+              item = item.copy();
+              // old version:
+              // item.addCondition("-NEAR,wien_tail,10000000,false");
+              item.addCondition("-AFTER(wien_tail)");
+              item.addCondition("-PARTOF(wien_rulemark)");
+            }
+            rP.preFillerPattern.add(item);
+            totalItemCount++;
+          }
+          rP.fillerPattern.add(slotItem.copy());
+          totalItemCount++;
+          for (int i = 0; i < p.r.size(); i++) {
+            WienRuleItem item = (WienRuleItem) p.r.get(i);
+            totalItemCount++;
+            if (k == slotNames.length - 1 && i == p.r.size() - 1) {
+              // the very last rule item marks the whole match
+              item = item.copy();
+              item.addAction("MARK(wien_rulemark, 1, " + totalItemCount + ")");
+            }
+            rP.postFillerPattern.add(item);
+          }
+          totalItemCount++; // the inter-slot ALL*? item has to be
+          // counted as well!
+        }
+        sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
+      } else {
+        sendStatusUpdateToDelegate("Done - Not all patterns could be learned!",
+                TextRulerLearnerState.ML_DONE, true);
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+      sendStatusUpdateToDelegate("Aborted due to Exception!", TextRulerLearnerState.ML_ERROR, true);
+    }
+    headTailCache.clear();
+    interTupelSeparatorsCache.clear();
+    TextRulerToolkit.log("-- WIEN END");
+  }
+
+  /**
+   * Searches a right delimiter for every slot. For each slot the prefixes of the first right
+   * context found in the first example document are tried with increasing length (bounded by the
+   * shortest right context of that document); the first prefix accepted by
+   * {@code testConstraint1} is stored as {@code patternPairs.get(k).r}.
+   *
+   * @return {@code true} if a right delimiter was found for every slot
+   */
+  protected boolean findRightPatterns() {
+    TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
+    boolean allFound = true;
+    for (int k = 0; k < slotNames.length; k++) {
+      List<TextRulerRulePattern> rightContexts = getRightContextForSlot(doc, k);
+      if (rightContexts.isEmpty()) {
+        // no non-empty right context at all: there is nothing to derive a delimiter from
+        // (previously this case ended in an IndexOutOfBoundsException)
+        allFound = false;
+        continue;
+      }
+      int shortest = Integer.MAX_VALUE;
+      for (TextRulerRulePattern p : rightContexts) {
+        shortest = Math.min(shortest, p.size());
+      }
+      TextRulerRulePattern sourcePattern = rightContexts.get(0);
+      boolean found = false;
+      for (int len = 1; len <= shortest; len++) {
+        // candidate: the first len items of the right context
+        TextRulerRulePattern subPattern = sourcePattern.subPattern(0, len);
+        if (testConstraint1(subPattern, k)) {
+          patternPairs.get(k).r = subPattern;
+          TextRulerToolkit.log("right " + k + ": " + subPattern);
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        allFound = false;
+      }
+    }
+    return allFound;
+  }
+
+  /**
+   * Searches a left delimiter for the slots 1..n-1 (the left delimiter of slot 0 is not handled
+   * here). For each slot the suffixes of the first left context found in the first example
+   * document are tried with increasing length (bounded by the shortest left context of that
+   * document); the first suffix accepted by {@code testConstraint2} is stored as
+   * {@code patternPairs.get(k).l} and its items are switched to generalized link markup.
+   *
+   * @return {@code true} if there is only one slot or a left delimiter was found for every
+   *         handled slot
+   */
+  protected boolean findLeftPatterns() {
+    // skip l 0 !
+    if (slotNames.length < 2) {
+      return true;
+    }
+    TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
+    boolean allFound = true;
+    for (int k = 1; k < slotNames.length; k++) {
+      List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
+      if (leftContexts.isEmpty()) {
+        // no non-empty left context at all: there is nothing to derive a delimiter from
+        // (previously this case ended in an IndexOutOfBoundsException)
+        allFound = false;
+        continue;
+      }
+      int shortest = Integer.MAX_VALUE;
+      for (TextRulerRulePattern p : leftContexts) {
+        shortest = Math.min(shortest, p.size());
+      }
+      TextRulerRulePattern sourcePattern = leftContexts.get(0);
+      boolean found = false;
+      for (int len = 1; len <= shortest; len++) {
+        // candidate: the last len items of the left context
+        TextRulerRulePattern subPattern = sourcePattern.subPattern(sourcePattern.size() - len, len);
+        if (testConstraint2(subPattern, k)) {
+          patternPairs.get(k).l = subPattern;
+          for (TextRulerRuleItem i : subPattern) {
+            ((WienRuleItem) i).getWordConstraint().setGeneralizeLinkMarkUp(true);
+          }
+          TextRulerToolkit.log("left " + k + ": " + subPattern);
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        allFound = false;
+      }
+    }
+    return allFound;
+  }
+
+  /**
+   * Searches a combination of head pattern (h), tail pattern (t) and left delimiter of the first
+   * slot (l1) that passes {@code testConstraint3}. Candidates are derived from the head and tail
+   * token sequences of the first example document. On success the result is stored in
+   * {@link #hPattern}, {@link #tPattern} and {@code patternPairs.get(0).l}.
+   *
+   * @return {@code true} if a valid combination was found; {@code false} if none was found or
+   *         {@code shouldAbort()} requested a stop
+   */
+  protected boolean findHeadTailAndL1Patterns() {
+    List<TextRulerExampleDocument> docs = exampleDocuments.getDocuments();
+    TextRulerExampleDocument doc0 = docs.get(0);
+    TextRulerRulePattern head = new TextRulerRulePattern();
+    TextRulerRulePattern tail = new TextRulerRulePattern();
+    getPageHeadAndTailPortion(doc0, head, tail);
+
+    // one way of splitting the page head into a front part (source for h) and a suffix (l1)
+    final class HLCandidate {
+      public TextRulerRulePattern head = new TextRulerRulePattern();
+
+      public TextRulerRulePattern l1 = new TextRulerRulePattern();
+    }
+
+    // a small optimization:
+    // find out the maximum possible length for l1 in doc0 since l1 is much
+    // smaller than the possible head length!
+    List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc0);
+    int shortestL1 = head.size() - 1;
+    for (TextRulerRulePattern its : interTupleSeparators)
+      shortestL1 = its.size() < shortestL1 ? its.size() : shortestL1;
+
+    List<HLCandidate> hlCandidates = new ArrayList<HLCandidate>();
+    // create candidates for each separation of the head and tail patterns:
+    // (the separator moves from the end of the head towards its start, so l1 grows by one item
+    // per candidate until it reaches shortestL1)
+    for (int separator = head.size() - 1; separator > 0; separator--) {
+      HLCandidate c = new HLCandidate();
+      for (int i = 0; i < head.size(); i++) {
+        if (i < separator)
+          c.head.add(head.get(i));
+        else {
+          // l1 items are copies with generalized link markup
+          WienRuleItem it = (WienRuleItem) head.get(i).copy();
+          it.getWordConstraint().setGeneralizeLinkMarkUp(true);
+          c.l1.add(it);
+        }
+      }
+      hlCandidates.add(c);
+      TextRulerToolkit.log(c.head.size() + " vs. " + c.l1.size());
+      if (c.l1.size() >= shortestL1)
+        break;
+    }
+
+    // total / current are only used to compute the progress percentage reported below
+    long total = 0;
+
+    // get total h l1 t combination count:
+    long tCand = (tail.size() * (tail.size() + 1)) / 2;
+    for (HLCandidate c : hlCandidates) {
+      total += ((c.head.size() - 1) * (c.head.size())) / 2;
+    }
+    total *= tCand;
+
+    long current = 0;
+    int oldPercent = -1;
+
+    for (HLCandidate c : hlCandidates) {
+      // for each "candidate" which represents a l1 suffix pattern of the
+      // head tokens and a rest pattern for the h pattern,
+      // we have to create every sub pattern of the remaining h pattern as
+      // a h candidate:
+      TextRulerRulePattern l1 = c.l1;
+      TextRulerRulePattern h = null;
+
+      // set as soon as a check blames l1 itself; the remaining h/t candidates of this l1 are
+      // then skipped
+      boolean l1Sucks = false;
+
+      // h candidates are the contiguous ranges [startI, endI] of c.head; note that both loop
+      // conditions are "> 0", so index 0 of c.head is never part of an h candidate
+      for (int endI = c.head.size() - 1; endI > 0; endI--) {
+        for (int startI = endI; startI > 0; startI--) {
+          h = new TextRulerRulePattern();
+          for (int i = startI; i <= endI; i++)
+            h.add(c.head.get(i));
+
+          // now for each h candidate we have to create each t
+          // candidate:
+          // (t candidates are the contiguous ranges [tstartI, tendI] of the page tail)
+          TextRulerRulePattern t = null;
+          for (int tstartI = 0; tstartI < tail.size(); tstartI++) {
+            for (int tendI = tstartI; tendI < tail.size(); tendI++) {
+              // report progress only when the rounded percentage changes
+              int percent = Math.round(((float) current * 100 / total));
+              if (percent != oldPercent) {
+                oldPercent = percent;
+                if (percent > 100)
+                  percent = 100;
+                // TextRulerToolkit.log(current+" / "+total);
+                sendStatusUpdateToDelegate("Testing C3, " + percent + "%",
+                        TextRulerLearnerState.ML_RUNNING, false);
+              }
+              if (shouldAbort())
+                return false;
+              current++;
+
+              t = new TextRulerRulePattern();
+              for (int i = tstartI; i <= tendI; i++)
+                t.add(tail.get(i));
+
+              // now we have a possible candidate triple: h, t and
+              // l1:
+
+              constraint3ReturnType c3Result = testConstraint3(h, t, l1);
+
+              if (c3Result == constraint3ReturnType.C3_SUCCESS) {
+                hPattern = h;
+                tPattern = t;
+                patternPairs.get(0).l = l1;
+                return true;
+              } else if (c3Result == constraint3ReturnType.C3_L1CandidateSuffixError
+                      || c3Result == constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError) {
+                // l1 itself is unusable; account for the skipped t candidates in the progress
+                l1Sucks = true;
+                current += tail.size() - tendI - 1;
+                break;
+              } else if (c3Result == constraint3ReturnType.C3_TailCandidateH_L1Error
+                      || c3Result == constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError) {
+                // no special pruning options here... we simply
+                // have to test the next t-candidate
+              } else if (c3Result == constraint3ReturnType.C3_TailCandidateRK_PrefixError
+                      || c3Result == constraint3ReturnType.C3_TailCandidateNotFoundError) {
+                // all candidates with the same start item are
+                // bad, so leave this inner loop:
+                current += tail.size() - tendI - 1;
+                break;
+              } else if (c3Result == constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError) {
+                // this is a problematic case... the cause could
+                // be L1 or the current Tail pattern,
+                // so we can't do anything about it! just try the
+                // next t-candidate
+              }
+            }
+            if (l1Sucks) {
+              // progress bookkeeping for the t candidates that are skipped
+              current += (tail.size() - tstartI - 1) * (tail.size() - tstartI) / 2;
+              break;
+            }
+          }
+          if (l1Sucks) {
+            // progress bookkeeping for the skipped h start positions
+            if (startI > 0)
+              current += (startI - 1) * tCand;
+            break;
+          }
+        }
+        if (l1Sucks) {
+          // progress bookkeeping for the skipped h end positions
+          current += (endI * (endI + 1) / 2) * tCand;
+          break;
+        }
+      }
+    }
+    return false;
+  }
+
+  protected void getPageHeadAndTailPortion(TextRulerExampleDocument doc, TextRulerRulePattern head,
+          TextRulerRulePattern tail) {
+    String key = doc.getCasFileName();
+    if (headTailCache.containsKey(key)) {
+      PatternPair p = headTailCache.get(key);
+      head.addAll(p.l);
+      tail.addAll(p.r);
+    } else {
+      CAS cas = doc.getCAS();
+      TextRulerExample firstExample = doc.getPositiveExamples().get(0);
+      TextRulerExample lastExample = doc.getPositiveExamples().get(
+              doc.getPositiveExamples().size() - 1);
+      TypeSystem ts = cas.getTypeSystem();
+      Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+      List<AnnotationFS> headTokens = TextRulerToolkit.getAnnotationsBeforePosition(cas,
+              firstExample.getAnnotations()[0].getBegin(), 0, TextRulerToolkit
+                      .getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+      TextRulerAnnotation[] lastExampleAnnotations = lastExample.getAnnotations();
+      List<AnnotationFS> tailTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas,
+              lastExampleAnnotations[lastExampleAnnotations.length - 1].getEnd(), 0,
+              TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+      for (AnnotationFS afs : headTokens)
+        head.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+      for (AnnotationFS afs : tailTokens)
+        tail.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+      PatternPair p = new PatternPair();
+      p.l.addAll(head);
+      p.r.addAll(tail);
+      headTailCache.put(key, p);
+    }
+  }
+
+  protected List<TextRulerRulePattern> getInterTupleSepatators(TextRulerExampleDocument doc) {
+    String key = doc.getCasFileName();
+    if (interTupelSeparatorsCache.containsKey(key)) {
+      return interTupelSeparatorsCache.get(key);
+    } else {
+      List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+      CAS cas = doc.getCAS();
+      TypeSystem ts = cas.getTypeSystem();
+      Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+      List<TextRulerExample> examples = doc.getPositiveExamples();
+      for (int i = 0; i < examples.size() - 1; i++) {
+        // get separator between i'th and (i+1)'th example:
+        TextRulerAnnotation[] exampleAnnotations1 = examples.get(i).getAnnotations();
+        TextRulerAnnotation[] exampleAnnotations2 = examples.get(i + 1).getAnnotations();
+        TextRulerAnnotation lastOf1 = exampleAnnotations1[exampleAnnotations1.length - 1];
+        TextRulerAnnotation firstOf2 = exampleAnnotations2[0];
+        List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, lastOf1
+                .getEnd(), firstOf2.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
+                slotNames, filterSet), tokenType);
+        TextRulerRulePattern thePattern = new TextRulerRulePattern();
+        for (AnnotationFS afs : theTokens)
+          thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+        if (thePattern.size() > 0)
+          result.add(thePattern);
+
+      }
+      interTupelSeparatorsCache.put(key, result);
+      return result;
+    }
+  }
+
+  protected List<TextRulerRulePattern> getRightContextForSlot(TextRulerExampleDocument doc,
+          int slotIndex) {
+    List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+    CAS cas = doc.getCAS();
+    TypeSystem ts = cas.getTypeSystem();
+    Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+    List<TextRulerExample> examples = doc.getPositiveExamples();
+    boolean isLastSlot = slotIndex >= slotNames.length - 1;
+    for (int ei = 0; ei < examples.size(); ei++) {
+      boolean isLastExample = ei == examples.size() - 1;
+      TextRulerExample e = examples.get(ei);
+      // get stuff between slot slotIndex and slotIndex+1
+      TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
+      TextRulerAnnotation nextSlotAnnotation;
+
+      if (!isLastSlot)
+        nextSlotAnnotation = e.getAnnotations()[slotIndex + 1];
+      else {
+        if (!isLastExample) // the next slot annotation is the first
+          // example annotation of the next template:
+          nextSlotAnnotation = examples.get(ei + 1).getAnnotations()[0];
+        else
+          nextSlotAnnotation = null;
+      }
+
+      List<AnnotationFS> theTokens;
+      if (nextSlotAnnotation == null)
+        theTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas, slotAnnotation.getEnd(), 0,
+                TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+      else
+        theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, slotAnnotation.getEnd(),
+                nextSlotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
+                        slotNames, filterSet), tokenType);
+      TextRulerRulePattern thePattern = new TextRulerRulePattern();
+      for (AnnotationFS afs : theTokens)
+        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+      if (thePattern.size() > 0)
+        result.add(thePattern);
+    }
+    return result;
+  }
+
+  /**
+   * Collects, for every positive example of a document, the tokens between the previous slot of
+   * the same example and the given slot. Empty contexts are left out.
+   *
+   * @param doc
+   *          the document to inspect
+   * @param slotIndex
+   *          index of the slot within an example
+   * @return the non-empty left contexts, or {@code null} for {@code slotIndex == 0} (the first
+   *         slot is not handled by this method)
+   */
+  protected List<TextRulerRulePattern> getLeftContextForSlot(TextRulerExampleDocument doc,
+          int slotIndex) {
+    if (slotIndex == 0)
+      return null;
+    List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+    CAS cas = doc.getCAS();
+    TypeSystem ts = cas.getTypeSystem();
+    Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+    List<TextRulerExample> examples = doc.getPositiveExamples();
+
+    // slotIndex is > 0 from here on, so the previous slot always belongs to the same example;
+    // the former "first slot" handling could never be reached and has been removed
+    for (TextRulerExample e : examples) {
+      // get stuff between slot slotIndex-1 and slotIndex
+      TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
+      TextRulerAnnotation prevSlotAnnotation = e.getAnnotations()[slotIndex - 1];
+
+      List<AnnotationFS> theTokens;
+      if (prevSlotAnnotation == null) {
+        // no previous annotation available: take everything in front of the slot
+        theTokens = TextRulerToolkit.getAnnotationsBeforePosition(cas, slotAnnotation.getBegin(),
+                0, TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+      } else {
+        theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, prevSlotAnnotation.getEnd(),
+                slotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
+                        filterSet), tokenType);
+      }
+      TextRulerRulePattern thePattern = new TextRulerRulePattern();
+      for (AnnotationFS afs : theTokens) {
+        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc), true));
+      }
+      if (thePattern.size() > 0) {
+        result.add(thePattern);
+      }
+    }
+    return result;
+  }
+
+  protected List<TextRulerRulePattern> getSlotFillerPatterns(TextRulerExampleDocument doc,
+          int slotIndex) {
+    List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+    CAS cas = doc.getCAS();
+    TypeSystem ts = cas.getTypeSystem();
+    Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+    List<TextRulerExample> examples = doc.getPositiveExamples();
+    for (TextRulerExample e : examples) {
+      TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
+      List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas,
+              slotAnnotation.getBegin(), slotAnnotation.getEnd(), TextRulerToolkit
+                      .getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+      TextRulerRulePattern thePattern = new TextRulerRulePattern();
+      for (AnnotationFS afs : theTokens)
+        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+      if (thePattern.size() > 0)
+        result.add(thePattern);
+    }
+    return result;
+  }
+
+  /**
+   * Checks a head / tail / l1 candidate triple against all example documents.
+   *
+   * @return the result of the first document that does not yield {@code C3_SUCCESS}, or
+   *         {@code C3_SUCCESS} if every document accepts the triple
+   */
+  protected constraint3ReturnType testConstraint3(TextRulerRulePattern h, TextRulerRulePattern t,
+          TextRulerRulePattern l1) {
+    constraint3ReturnType outcome = constraint3ReturnType.C3_SUCCESS;
+    for (TextRulerExampleDocument document : exampleDocuments.getDocuments()) {
+      outcome = testConstraint3(document, h, t, l1);
+      if (outcome != constraint3ReturnType.C3_SUCCESS) {
+        // the first failing document decides
+        break;
+      }
+    }
+    return outcome;
+  }
+
+  protected boolean testConstraint1(TextRulerExampleDocument doc, TextRulerRulePattern rk, int k) {
+    List<TextRulerRulePattern> rightContexts = getRightContextForSlot(doc, k);
+    for (TextRulerRulePattern rx : rightContexts) {
+      if (rx.find(rk) != 0)
+        return false;
+    }
+    List<TextRulerRulePattern> contents = getSlotFillerPatterns(doc, k);
+    for (TextRulerRulePattern c : contents) {
+      if (c.find(rk) >= 0)
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Checks a right delimiter candidate for slot {@code k} against all example documents.
+   *
+   * @return {@code true} if every document accepts the candidate
+   */
+  protected boolean testConstraint1(TextRulerRulePattern rk, int k) {
+    boolean accepted = true;
+    for (TextRulerExampleDocument document : exampleDocuments.getDocuments()) {
+      if (!testConstraint1(document, rk, k)) {
+        // one rejecting document is enough
+        accepted = false;
+        break;
+      }
+    }
+    return accepted;
+  }
+
+  protected boolean testConstraint2(TextRulerExampleDocument doc, TextRulerRulePattern lk, int k) {
+    List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
+    for (TextRulerRulePattern lx : leftContexts) {
+      if (lx.size() < lk.size())
+        return false;
+      int pos = lx.find(lk);
+      if (pos < 0 || pos != lx.size() - lk.size())
+        return false;
+    }
+    return true;
+  }
+
+  /**
+   * Checks a left delimiter candidate for slot {@code k} against all example documents.
+   *
+   * @return {@code true} if every document accepts the candidate
+   */
+  protected boolean testConstraint2(TextRulerRulePattern lk, int k) {
+    boolean accepted = true;
+    for (TextRulerExampleDocument document : exampleDocuments.getDocuments()) {
+      if (!testConstraint2(document, lk, k)) {
+        // one rejecting document is enough
+        accepted = false;
+        break;
+      }
+    }
+    return accepted;
+  }
+
+  /**
+   * Checks a head (h) / tail (t) / first left delimiter (l1) candidate triple against one
+   * document. The checks run in a fixed order and the first one that fails determines the
+   * returned value; see the numbered comments in the body.
+   *
+   * @param doc
+   *          the document to test against
+   * @param h
+   *          head pattern candidate
+   * @param t
+   *          tail pattern candidate
+   * @param l1
+   *          candidate for the left delimiter of the first slot
+   * @return {@code C3_SUCCESS} if all checks pass, otherwise the value naming the failed check
+   */
+  protected constraint3ReturnType testConstraint3(TextRulerExampleDocument doc,
+          TextRulerRulePattern h, TextRulerRulePattern t, TextRulerRulePattern l1) {
+    // compile-time switch for the REASON log output below
+    final boolean logReasons = false;
+
+    TextRulerRulePattern head = new TextRulerRulePattern();
+    TextRulerRulePattern tail = new TextRulerRulePattern();
+
+    getPageHeadAndTailPortion(doc, head, tail);
+
+    // 1: l1 must be a proper suffix of the portion between the end of h and
+    // the first slot filler:
+    // (head / h) / l1 = l1
+
+    // NOTE(review): hPos is not checked for "not found" before it is used below - confirm what
+    // find()/subPattern() do when h does not occur in this document's head
+    int hPos = head.find(h);
+
+    // TODO precalculate this outside this method ?
+    TextRulerRulePattern restForL1 = head.subPattern(hPos + h.size(), -1).copy();
+    // work on a copy with generalized link markup
+    for (TextRulerRuleItem it : restForL1)
+      ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
+    int l1Pos = restForL1.find(l1);
+    if (l1Pos < 0 || l1Pos != restForL1.size() - l1.size()) {
+      TextRulerToolkit.logIf(logReasons, "REASON 1\n\tl1         \t" + l1 + "\n\trestforl1\t"
+              + restForL1);
+      return constraint3ReturnType.C3_L1CandidateSuffixError;
+    }
+
+    // 2: t must not occur in the subpattern after h and before l1
+    if (l1Pos > 0) {
+      TextRulerRulePattern patternBetweenHandL1 = restForL1.subPattern(0, l1Pos);
+      if (patternBetweenHandL1.size() >= t.size()) {
+        if (patternBetweenHandL1.find(t) >= 0) {
+          TextRulerToolkit.logIf(logReasons, "REASON 2");
+          return constraint3ReturnType.C3_TailCandidateH_L1Error;
+        }
+      }
+    }
+
+    // 2a: addons, not specified in WIEN paper !!
+    TextRulerRulePattern lastSlotRightPattern = patternPairs.get(slotNames.length - 1).r;
+    if (t.find(lastSlotRightPattern) == 0) // the right boundary of the last
+    // slot may not be part of the
+    // tail pattern!
+    {
+      TextRulerToolkit.logIf(logReasons, "REASON 3: " + lastSlotRightPattern + "\tTail: " + t);
+      return constraint3ReturnType.C3_TailCandidateRK_PrefixError;
+    }
+
+    int tPos = tail.find(t);
+    if (tPos < 0) {
+      TextRulerToolkit.logIf(logReasons, "REASON 4");
+      return constraint3ReturnType.C3_TailCandidateNotFoundError;
+    } // this is an own constraint definition: if a document does not have
+    // the tail in it,
+    // what should we do then ? is this an error or is this okay since the
+    // document may not have any tail after the data ?
+
+    // 3: l1 must not precede t in the page's tail:
+    int l1tPos = tail.find(l1);
+    if (l1tPos >= 0) // l1 occurs in the page's tail:
+    {
+      if (l1tPos < tPos) {
+        TextRulerToolkit.logIf(logReasons, "REASON 5");
+        return constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError;
+      }
+    }
+
+    List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc);
+
+    for (TextRulerRulePattern itSep : interTupleSeparators) {
+      // 4: l1 must be a proper suffix of each of the inter-tuple
+      // separators:
+      // (tested on a copy with generalized link markup)
+      TextRulerRulePattern itSepCopy = itSep.copy();
+      for (TextRulerRuleItem it : itSepCopy)
+        ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
+      int l1itsPos = itSepCopy.find(l1);
+      if (l1itsPos < 0 || l1itsPos != itSepCopy.size() - l1.size()) {
+        TextRulerToolkit.logIf(logReasons, "REASON 6: \n\tl1\t" + l1 + "\n\titSep\t" + itSep);
+        return constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError;
+      }
+
+      // 5: t must never precede l1 in any inter-tuple separator:
+      // (t is searched in the original separator, not in the generalized copy)
+      int itstPos = itSep.find(t);
+      if (itstPos >= 0 && itstPos < l1itsPos) {
+        TextRulerToolkit.logIf(logReasons, "REASON 7");
+        return constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError;
+      }
+
+    }
+    return constraint3ReturnType.C3_SUCCESS;
+  }
+
+  /**
+   * Renders the learned result as script text: the file header, the declarations, the rules that
+   * mark the content area and the tail, the {@code findData} block containing the learned rule,
+   * and the final clean-up rules.
+   *
+   * @return the script text, or {@code "<no results yet>"} as long as no rule has been built
+   */
+  public String getResultString() {
+    if (theRule == null) {
+      return "<no results yet>";
+    }
+
+    final StringBuilder script = new StringBuilder();
+    script.append(getTMFileHeaderString());
+    script.append("DECLARE wien_tail;\n");
+    script.append("DECLARE wien_rulemark;\n");
+    script.append("DECLARE wien_content;\n");
+    script.append("BOOLEAN wien_redo;\n\n");
+    script.append("// tail/head/content area stuff:\n");
+
+    // head rule: works on a copy so that the learned pattern stays untouched
+    final TextRulerRulePattern headRule = hPattern.copy();
+    ((WienRuleItem) headRule.get(0)).addCondition("-PARTOF(wien_content)");
+    script.append(headRule).append(" ALL*?{->MARK(wien_content)};\n");
+
+    // tail rule: again on a copy
+    final TextRulerRulePattern tailRule = tPattern.copy();
+    ((WienRuleItem) tailRule.get(0)).addCondition("PARTOF(wien_content)");
+    script.append(tailRule).append("{->MARK(wien_tail");
+    if (tPattern.size() > 1) {
+      // a tail of several items needs an explicit index range
+      script.append(", 1, ").append(tPattern.size());
+    }
+    script.append(")};\n\n");
+
+    script.append("BLOCK(findData) wien_content {\n");
+    script.append("\t// find out if tail is before the next occurence of l1\n");
+    script.append("\t");
+    script.append(theRule.getRuleString());
+    script.append("\n");
+    script.append("\tDocument{->ASSIGN(wien_redo, false)};\n");
+    script.append("\twien_tail{PARTOF(wien_rulemark)->UNMARK(wien_tail), ASSIGN(wien_redo, true)}; // remove tail marks that are no longer relevant for us after the last rule !\n");
+    script.append("\tDocument{IF(wien_redo)->CALL(filename.findData)};\n");
+    script.append("}\n");
+
+    script.append("\n// cleaning up:\n");
+    script.append("wien_tail{->UNMARK(wien_tail)};\n");
+    script.append("wien_rulemark{->UNMARK(wien_rulemark)};\n");
+    script.append("wien_content{->UNMARK(wien_content)};\n");
+    return script.toString();
+  }
+
+  /**
+   * Does nothing: the passed parameters are not read by this learner.
+   */
+  public void setParameters(Map<String, Object> params) {
+  }
+
+}

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain