You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2011/08/12 12:32:54 UTC
svn commit: r1157037 [8/10] - in
/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler: ./ .settings/
META-INF/ icons/ schema/ src/ src/main/ src/main/java/ src/main/java/org/
src/main/java/org/apache/ src/main/java/org/apache/uima/ src/main/jav...
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,670 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerExampleDocument;
+import org.apache.uima.textmarker.textruler.core.TextRulerRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleList;
+import org.apache.uima.textmarker.textruler.core.TextRulerRulePattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerSlotPattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.textmarker.textruler.learner.whisk.token.WhiskRuleItem.MLWhiskOtherConstraint;
+
+public class Whisk extends TextRulerBasicLearner {
+
+ /** Parameter-map key under which {@link #setParameters(Map)} looks up the window size. */
+ public static final String WINDOSIZE_KEY = "windowSize";
+
+ /** Parameter-map key under which {@link #setParameters(Map)} looks up the error threshold. */
+ public static final String ERROR_THRESHOLD_KEY = "errorThreshold";
+
+ /** Parameter-map key under which {@link #setParameters(Map)} looks up the POS tag root type. */
+ public static final String POSTAG_ROOTTYPE_KEY = "posTagRootType";
+
+ /** Initial value of {@link #windowSize}. */
+ public static final int STANDARD_WINDOWSIZE = 5;
+
+ /** Initial value of {@link #errorThreshold}. */
+ public static final float STANDARD_ERROR_THRESHOLD = 0.1f;
+
+ /** Initial value of {@link #posTagRootTypeName}. */
+ public static final String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";
+
+ /** Rules accepted so far by {@link #doRun()}; {@code null} until the first run started. */
+ TextRulerRuleList ruleList;
+
+ /** Positive examples covered by the rules that were added to {@link #ruleList}. */
+ protected Set<TextRulerExample> coveredExamples;
+
+ /** Maximum distance (in term numbers) between the slot and a term proposed for extension. */
+ protected int windowSize = STANDARD_WINDOWSIZE;
+
+ /** Upper bound of the Laplacian up to which a rule that still covers negatives is accepted. */
+ protected double errorThreshold = STANDARD_ERROR_THRESHOLD;
+
+ /** Name of the type used to look up POS tags; {@code null} or empty disables POS tag terms. */
+ protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;
+
+ /** Number of rule inductions started so far (shown in status messages). */
+ int roundNumber = 0;
+
+ /** Number of positive examples of the current run (shown in status messages). */
+ int allExamplesCount = 0;
+
+ /** Covering statistics of already tested rules, keyed by the rule string. */
+ private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics =
+     new HashMap<String, TextRulerStatisticsCollector>();
+
+ /**
+  * Creates the learner. All arguments are handed unchanged to the
+  * {@link TextRulerBasicLearner} constructor; this class adds no initialization of its own.
+  */
+ public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
+ Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+ super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, delegate);
+ }
+
+ /**
+  * Always returns {@code false}.
+  * NOTE(review): presumably this tells the rule-testing code of the base learner not to collect
+  * the covered negative instances - confirm against TextRulerBasicLearner.
+  */
+ @Override
+ public boolean collectNegativeCoveredInstancesWhenTesting() {
+ return false;
+ }
+
+ /**
+  * Learns the rule list with a covering loop: for every positive example that is not yet
+  * covered, a rule is grown from that example and kept if it covers no negative examples or if
+  * its Laplacian does not exceed {@link #errorThreshold}. The examples covered by a kept rule
+  * are not used as seeds any more.
+  */
+ @Override
+ protected void doRun() {
+   // We do not use the same overall structure as the original WHISK: we do not repeat the
+   // whole process for new training documents at the user's request. Like the other
+   // algorithms we learn from the whole training set, so there is no need to test the
+   // intermediate rule base on a newly "incoming" training document - all rules have already
+   // been tested on all training documents!
+   //
+   // This version of WHISK is not tested for multi slot learning since the seminar
+   // announcements are not quite suitable for this task: they do not all contain all 4 slots,
+   // some slots occur more than once in one document, and their order is not always the same.
+   // So this is only tested for the single slot case, even if it is built capable of
+   // multislot examples!
+   //
+   // This is the inner loop of the WHISK pseudo-code:
+   //   For each inst in Training
+   //     for each tag
+
+   cachedTestedRuleStatistics.clear();
+   ruleList = new TextRulerRuleList();
+   coveredExamples = new HashSet<TextRulerExample>();
+
+   sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false);
+   // only a single-slot target for now
+   TextRulerTarget target = new TextRulerTarget(slotNames[0], this);
+   exampleDocuments.createExamplesForTarget(target);
+
+   TextRulerExampleDocument[] documents = exampleDocuments
+       .getSortedDocumentsInCacheOptimizedOrder();
+
+   allExamplesCount = exampleDocuments.getAllPositiveExamples().size();
+
+   for (TextRulerExampleDocument document : documents) {
+     // every example that is still uncovered becomes the seed of a new rule
+     for (TextRulerExample seed : document.getPositiveExamples()) {
+       if (coveredExamples.contains(seed)) {
+         continue;
+       }
+       roundNumber++;
+       WhiskRule candidate = growRule(document, seed);
+       if (shouldAbort()) {
+         break;
+       }
+       if (candidate == null) {
+         continue;
+       }
+       boolean errorFree = candidate.getCoveringStatistics().getCoveredNegativesCount() == 0;
+       if (errorFree || candidate.getLaplacian() <= errorThreshold) {
+         ruleList.addRule(candidate);
+         coveredExamples.addAll(candidate.getCoveringStatistics().getCoveredPositiveExamples());
+         sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING, true);
+       }
+     }
+     if (shouldAbort()) {
+       return;
+     }
+   }
+   sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
+   cachedTestedRuleStatistics.clear();
+ }
+
+ protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) {
+ sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING,
+ false);
+ WhiskRule theRule = new WhiskRule(this, example.getTarget(), example);
+ int numberOfSlotsInTag = example.getAnnotations().length;
+ for (int i = 0; i < numberOfSlotsInTag; i++)
+ theRule.getPatterns().add(new TextRulerSlotPattern());
+
+ List<WhiskRuleItem> allTerms = getAllTermsOfExample(example);
+
+ sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING,
+ false);
+ for (int i = 0; i < numberOfSlotsInTag; i++) {
+ theRule = anchor(theRule, doc, example, allTerms, i);
+ if (shouldAbort())
+ return null;
+ }
+
+ sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING,
+ false);
+ if (theRule != null) {
+ double oldLaplacian = theRule.getLaplacian();
+ int subRoundNumber = 0;
+ // repeat while we still make errors...
+ while (theRule.getCoveringStatistics().getCoveredNegativesCount() > 0) {
+ WhiskRule extendedRule = extendRule(theRule, doc, example, allTerms, subRoundNumber);
+ if (extendedRule == null) {
+ // this way we get the previous rule
+ // as the best rule...
+ break;
+ }
+ theRule = extendedRule;
+ TextRulerToolkit.log("----------------------------");
+ TextRulerToolkit.log("BEST EXTENSION IS: " + theRule.getRuleString());
+ TextRulerToolkit.log("Laplacian: " + theRule.getLaplacian() + " ; "
+ + theRule.getCoveringStatistics());
+ subRoundNumber++;
+
+ double newLaplacian = theRule.getLaplacian();
+ if (newLaplacian >= oldLaplacian) {
+ break;
+ }
+ oldLaplacian = newLaplacian;
+ }
+ TextRulerToolkit.log("----------------------------");
+ TextRulerToolkit.log("FINAL RULE IS : " + theRule.getRuleString());
+ }
+ return theRule;
+ }
+
+ /**
+  * Tries to improve {@code rule} by adding exactly one more term of the seed example.
+  * <p>
+  * For every term of {@code allTerms} that is not yet part of the rule and lies within
+  * {@link #windowSize} terms of the first slot, up to five candidate rules are proposed: the
+  * plain extension, a variant that hides the regular expression of a regexp term and, if a POS
+  * tag root type is configured, variants carrying a POS tag constraint. All candidates are
+  * tested on the training set (through the statistics cache). The candidate with the lowest
+  * Laplacian wins; among candidates with equal Laplacian the one with fewer constraint points
+  * is preferred.
+  *
+  * @param rule the rule to extend
+  * @param doc the document the seed example belongs to
+  * @param example the seed example of the rule
+  * @param allTerms all terms of the seed example's document
+  * @param subRoundNumber extension round counter, only used for the status message
+  * @return the best rule found; {@code rule} itself if it already meets the error threshold and
+  *         no candidate has a lower Laplacian; {@code null} if neither the rule nor any
+  *         candidate qualifies or if the learner was aborted while testing
+  */
+ protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
+     TextRulerExample example, List<WhiskRuleItem> allTerms, int subRoundNumber) {
+   WhiskRule bestRule = null;
+   double bestL = 1.0;
+   int bestRuleConstraintPoints = -1;
+   if (rule.getLaplacian() <= errorThreshold) {
+     bestRule = rule;
+     bestL = rule.getLaplacian();
+   }
+
+   // for now this works only for slot 0 (no multislot stuff here yet!)
+   List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms,
+       example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd());
+   if (slotTerms.isEmpty()) {
+     // without slot terms there is no window to pick extension terms from
+     return bestRule;
+   }
+   int firstSlotTermNumber = slotTerms.get(0).getTermNumberInExample();
+   int lastSlotTermNumber = slotTerms.get(slotTerms.size() - 1).getTermNumberInExample();
+
+   List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();
+   for (WhiskRuleItem term : allTerms) {
+     if (rule.containsTerm(term)) {
+       continue;
+     }
+
+     // The window is measured from the nearest slot boundary: from the first slot term for
+     // terms in front of the slot and from the last slot term for terms behind it.
+     int termNumber = term.getTermNumberInExample();
+     boolean outOfWindow = false;
+     if (termNumber < firstSlotTermNumber) {
+       outOfWindow = firstSlotTermNumber - termNumber > windowSize;
+     } else if (termNumber > lastSlotTermNumber) {
+       outOfWindow = termNumber - lastSlotTermNumber > windowSize;
+     }
+     if (outOfWindow) {
+       continue;
+     }
+
+     WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
+     if (proposedRule == null) {
+       // the term could not be added (inconsistent rule or no change), so nothing to test
+       continue;
+     }
+     addExtensionCandidate(rulesToTest, proposedRule);
+
+     // add a second version where we remove the exact token content if it is a regexp item:
+     WhiskRule proposedRule2 = null;
+     WhiskRuleItem addedItem = proposedRule.searchItemWithTermNumber(termNumber);
+     if (addedItem != null && addedItem.getWordConstraint().isRegExpConstraint()) {
+       proposedRule2 = proposedRule.copy();
+       proposedRule2.searchItemWithTermNumber(termNumber).setHideRegExp(true);
+       proposedRule2.setNeedsCompile(true);
+       addExtensionCandidate(rulesToTest, proposedRule2);
+     }
+
+     // and now, for WHISK performance testing purposes, we also add POS tags. This is not
+     // dynamic feature capable, it only exists to test WHISK with PosTag terms.
+     addPosTagCandidates(rulesToTest, proposedRule, proposedRule2, term, doc, example);
+   }
+   if (rulesToTest.size() == 0) {
+     return bestRule;
+   }
+
+   sendStatusUpdateToDelegate(
+       "Round "
+           + roundNumber
+           + "."
+           + subRoundNumber
+           + " - Testing "
+           + rulesToTest.size()
+           + " rules... "
+           + " - uncovered examples: "
+           + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
+               + " ; cs=" + cachedTestedRuleStatistics.size()),
+       TextRulerLearnerState.ML_RUNNING, false);
+
+   TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
+   for (TextRulerRule r : rulesToTest) {
+     TextRulerToolkit.log(r.getRuleString());
+   }
+   testRulesIfNotCached(rulesToTest);
+   if (shouldAbort()) {
+     return null;
+   }
+   for (TextRulerRule r : rulesToTest) {
+     WhiskRule wr = (WhiskRule) r;
+     if (wr.getLaplacian() < bestL) {
+       bestL = wr.getLaplacian();
+       bestRule = wr;
+       bestRuleConstraintPoints = bestRule.totalConstraintPoints();
+     } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
+       // bestRuleConstraintPoints stays -1 until a candidate was chosen, so a candidate never
+       // replaces the unextended input rule on a mere tie
+       TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
+       if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
+         TextRulerToolkit.log("\tYes, prefered!");
+         bestL = wr.getLaplacian();
+         bestRule = wr;
+         bestRuleConstraintPoints = bestRule.totalConstraintPoints();
+       }
+     }
+   }
+   return bestRule;
+ }
+
+ /** Appends {@code candidate} to {@code rulesToTest} unless the list already contains it. */
+ private void addExtensionCandidate(List<TextRulerRule> rulesToTest, WhiskRule candidate) {
+   if (!rulesToTest.contains(candidate)) {
+     rulesToTest.add(candidate);
+   }
+ }
+
+ /**
+  * Adds the POS tag variants of a proposed extension to {@code rulesToTest}. Does nothing if no
+  * POS tag root type is configured, if that type is unknown to the type system, or if the first
+  * POS tag annotation found within the token does not span exactly the token of {@code term}.
+  */
+ private void addPosTagCandidates(List<TextRulerRule> rulesToTest, WhiskRule proposedRule,
+     WhiskRule proposedRule2, WhiskRuleItem term, TextRulerExampleDocument doc,
+     TextRulerExample example) {
+   if (posTagRootTypeName == null || posTagRootTypeName.length() == 0) {
+     return;
+   }
+   CAS cas = example.getDocumentCAS();
+   TypeSystem ts = cas.getTypeSystem();
+   Type posTagsRootType = ts.getType(posTagRootTypeName);
+   if (posTagsRootType == null) {
+     // the configured type does not exist in this type system, so there are no POS tags
+     return;
+   }
+   TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
+   // POS-Tags created by our test hmm tagger.
+   List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
+       tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
+   if (posTagAnnotations.isEmpty()) {
+     return;
+   }
+   AnnotationFS posTag = posTagAnnotations.get(0);
+   if (posTag.getBegin() != tokenAnnotation.getBegin()
+       || posTag.getEnd() != tokenAnnotation.getEnd()) {
+     return;
+   }
+   TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc);
+   int termNumber = term.getTermNumberInExample();
+
+   // 1. most specific term with all constraints we have:
+   addExtensionCandidate(rulesToTest,
+       copyWithPosTag(proposedRule, termNumber, tokenAnnotation, posTagAnnotation, false));
+
+   // 2. the same without the regexp thingy:
+   if (proposedRule2 != null) {
+     addExtensionCandidate(rulesToTest,
+         copyWithPosTag(proposedRule2, termNumber, tokenAnnotation, posTagAnnotation, false));
+   }
+
+   // 3. last but not least: a rule with only the pos tag constraint:
+   addExtensionCandidate(rulesToTest,
+       copyWithPosTag(proposedRule, termNumber, tokenAnnotation, posTagAnnotation, true));
+ }
+
+ /**
+  * Copies {@code base} and adds a POS tag constraint to the item with the given term number;
+  * optionally removes the word constraint of that item. The copy is marked for recompilation.
+  */
+ private WhiskRule copyWithPosTag(WhiskRule base, int termNumber,
+     TextRulerAnnotation tokenAnnotation, TextRulerAnnotation posTagAnnotation,
+     boolean dropWordConstraint) {
+   WhiskRule variant = base.copy();
+   WhiskRuleItem item = variant.searchItemWithTermNumber(termNumber);
+   item.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
+   if (dropWordConstraint) {
+     item.setWordConstraint(null);
+   }
+   variant.setNeedsCompile(true);
+   return variant;
+ }
+
+ /**
+  * Creates a copy of {@code baseRule} that additionally contains {@code term}.
+  * <p>
+  * The pattern the term belongs to (pre-filler, filler or post-filler of a slot) is determined
+  * by comparing the term number with the term numbers of the items already present. A copy of
+  * the term then either replaces the wildcard item that carries the same term number or is
+  * inserted in term-number order. Finally a wildcard is inserted towards the left and right
+  * neighbor item if that neighbor is neither adjacent in the seed token stream nor a wildcard
+  * itself.
+  *
+  * @param baseRule the rule to extend; it is not modified
+  * @param term the term to add; only a copy of it is put into the new rule
+  * @return the extended copy, or {@code null} if an item with the same term number exists that
+  *         is no wildcard or is not part of the target pattern, or if the rule string of the
+  *         result equals the one of {@code baseRule}
+  */
+ protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
+ WhiskRule newRule = baseRule.copy();
+ int foundSlotNumber = -1; // debug info
+ String foundSlotPattern = "";
+ int termNumber = term.getTermNumberInExample();
+ // determine, where this term is located relatively to the slots we
+ // have...
+ TextRulerRulePattern targetPattern = null;
+ TextRulerRulePattern previousSlotPostFillerPattern = null;
+ // NOTE(review): the loop keeps running after a pattern was found and the pre-filler check
+ // below is not guarded by targetPattern == null, so with more than one slot a later slot may
+ // overwrite the result - confirm before relying on multislot rules.
+ for (int i = 0; i < newRule.getPatterns().size(); i++) {
+ TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i);
+ // look at the prefiller pattern
+ WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem();
+ if (it != null && termNumber <= it.getTermNumberInExample())
+ targetPattern = slotPattern.preFillerPattern;
+ // now look at the filler pattern
+ if (targetPattern == null && slotPattern.fillerPattern.size() > 0)
+ {
+ it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem();
+ // it's still for the prefiller pattern, but that one seems to be empty so we could not
+ // find that out above!
+ if (termNumber < it.getTermNumberInExample())
+ targetPattern = slotPattern.preFillerPattern;
+ else {
+ it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem();
+ if (termNumber <= it.getTermNumberInExample()) {
+ targetPattern = slotPattern.fillerPattern;
+ }
+ }
+ }
+ // now look at the postfiller pattern
+ if (targetPattern == null && slotPattern.postFillerPattern.size() > 0)
+ {
+ it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem();
+ // it's still for the filler pattern, but that one seems to be empty so we could not
+ // find that out above!
+ if (termNumber < it.getTermNumberInExample())
+ targetPattern = slotPattern.fillerPattern;
+ else {
+ it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem();
+ if (termNumber <= it.getTermNumberInExample())
+ targetPattern = slotPattern.postFillerPattern;
+ }
+ }
+ // nothing matched in this slot: fall back to the post-filler of the previous slot (this is
+ // still null for the first slot); foundSlot* are only kept for the debug output below
+ if (targetPattern == null) {
+ targetPattern = previousSlotPostFillerPattern;
+ if (i > 0) {
+ TextRulerSlotPattern prevSlotPattern = newRule.getPatterns().get(i - 1);
+ foundSlotPattern = targetPattern == prevSlotPattern.preFillerPattern ? "PRE FILLER"
+ : (targetPattern == prevSlotPattern.fillerPattern ? "FILLER" : "POST FILLER");
+ foundSlotNumber = i - 1;
+ }
+ } else {
+ foundSlotPattern = targetPattern == slotPattern.preFillerPattern ? "PRE FILLER"
+ : (targetPattern == slotPattern.fillerPattern ? "FILLER" : "POST FILLER");
+ foundSlotNumber = i;
+ }
+ previousSlotPostFillerPattern = slotPattern.postFillerPattern;
+ }
+
+ // the term lies behind everything we have: it goes into the post-filler of the last slot
+ if (targetPattern == null) {
+ targetPattern = previousSlotPostFillerPattern;
+ foundSlotNumber = newRule.getPatterns().size() - 1;
+ foundSlotPattern = "POST FILLER";
+ }
+
+ // still null here only if the rule has no slot patterns at all
+ if (targetPattern == null) {
+ TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
+ } else {
+ // TextRulerToolkit.log("Ok, found for Rule: "+newRule.getRuleString());
+ // TextRulerToolkit.log("Term: "+term.getTermNumberInExample()+" ; "+term);
+ // TextRulerToolkit.log("Slot "+foundSlotNumber+" - Pattern: "+foundSlotPattern);
+ // now put that term into the rule:
+ int indexInPattern = -1;
+ if (targetPattern.size() == 0) {
+ targetPattern.add(term.copy());
+ indexInPattern = 0;
+ } else {
+ // 1. search if the term would replace a wildcard:
+ WhiskRuleItem wildCard = newRule.searchItemWithTermNumber(termNumber);
+ if (wildCard != null) {
+ if (!wildCard.isStarWildCard()) {
+ TextRulerToolkit
+ .log("ERROR, FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW IS THAT???");
+ return null;
+ }
+ if (!targetPattern.contains(wildCard)) {
+ TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
+ return null;
+ }
+ indexInPattern = targetPattern.indexOf(wildCard);
+ targetPattern.set(indexInPattern, term.copy());
+ } else {
+ // not a wildcard, so search for the insertion point:
+ for (int i = 0; i < targetPattern.size(); i++) {
+ WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i);
+ if (termNumber < it.getTermNumberInExample()) {
+ indexInPattern = i;
+ break;
+ }
+ }
+ // no item with a larger term number found: append at the end
+ if (indexInPattern < 0) {
+ indexInPattern = targetPattern.size();
+ targetPattern.add(term.copy());
+ } else
+ targetPattern.add(indexInPattern, term.copy());
+ }
+ }
+ // ok, now we have replaced a wildcard with the term or added the
+ // term between two other items.
+ // we now have to check the neighbors of the new term: if it is a
+ // direct neighbor (according to the termNumber),
+ // we have nothing special to do. but if it is not a direct
+ // neighbor, we have to add a wildcard between the two items (if the
+ // neighbor item
+ // is not a wildcard itself!
+ WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);
+
+ // look at left neighbor:
+ WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true);
+ if (left != null) {
+ // TextRulerToolkit.log("LEFT NEIGHBOR FOUND!");
+
+ // so we have a left neighbor. let's see if it also is the
+ // neighbor in our seed token stream:
+ // no direct neighbor and no wildcard yet, so insert a wildcard between us!
+ if (left.getTermNumberInExample() < newTerm.getTermNumberInExample() - 1
+ && !left.isStarWildCard()) {
+ targetPattern.add(indexInPattern,
+ WhiskRuleItem.newWildCardItem(left.getTermNumberInExample() + 1));
+ // the new term moved one position to the right
+ indexInPattern++;
+ }
+ }
+
+ // look at right neighbor:
+ WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false);
+ if (right != null) {
+ // TextRulerToolkit.log("RIGHT NEIGHBOR FOUND!");
+ // so we have a right neighbor. let's see if it also is the
+ // neighbor in our seed token stream:
+ // no direct neighbor and no wildcard yet, so insert a wildcard between us!
+ if (right.getTermNumberInExample() > newTerm.getTermNumberInExample() + 1
+ && !right.isStarWildCard()) {
+ WhiskRuleItem wc = WhiskRuleItem.newWildCardItem(newTerm.getTermNumberInExample() + 1);
+ if (indexInPattern + 1 < targetPattern.size())
+ targetPattern.add(indexInPattern + 1, wc);
+ else
+ targetPattern.add(wc);
+ }
+ }
+
+ newRule.setNeedsCompile(true);
+ // TextRulerToolkit.log("BEFORE: "+baseRule.getRuleString());
+ // TextRulerToolkit.log("AFTER : "+newRule.getRuleString());
+ // TextRulerToolkit.log("");
+ }
+ // an unchanged rule string must not happen, so such a result is discarded
+ if (newRule.getRuleString().equals(baseRule.getRuleString()))
+ return null;
+ else
+ return newRule;
+ }
+
+ protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
+ TextRulerExample example, List<WhiskRuleItem> allTerms, int slotIndex) {
+ TextRulerAnnotation slotAnnotation = example.getAnnotations()[slotIndex];
+ List<WhiskRuleItem> inside = getTermsWithinBounds(allTerms, slotAnnotation.getBegin(),
+ slotAnnotation.getEnd());
+
+ if (rule == null || inside.isEmpty()) {
+ return null;
+ }
+ // create base 1 and base 2:
+ WhiskRule base1 = rule.copy(); // slot filler rule
+ TextRulerSlotPattern slotPattern = base1.getPatterns().get(slotIndex);
+ for (int i = 0; i < inside.size(); i++)
+ if (i == 0 || (i == inside.size() - 1))
+ slotPattern.fillerPattern.add(inside.get(i).copy());
+ else if (inside.size() > 2 && i < 2)
+ slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(i)
+ .getTermNumberInExample()));
+
+ WhiskRule base2 = rule.copy(); // slot context rule
+ slotPattern = base2.getPatterns().get(slotIndex);
+
+ int firstOfSlot = allTerms.indexOf(inside.get(0));
+ int lastOfSlot = allTerms.indexOf(inside.get(inside.size() - 1));
+ if (firstOfSlot > 0)
+ slotPattern.preFillerPattern.add(allTerms.get(firstOfSlot - 1));
+ slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(0)
+ .getTermNumberInExample()));
+ if (lastOfSlot + 1 < allTerms.size())
+ slotPattern.postFillerPattern.add(allTerms.get(lastOfSlot + 1));
+
+ TextRulerToolkit.log("base1: " + base1.getRuleString());
+ TextRulerToolkit.log("base2: " + base2.getRuleString());
+ List<TextRulerRule> testRules = new ArrayList<TextRulerRule>();
+ testRules.add(base1);
+ testRules.add(base2);
+ // testRulesOnDocumentSet(testRules, exampleDocuments);
+ testRulesIfNotCached(testRules);
+ if (shouldAbort())
+ return null;
+ TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
+ + base1.getLaplacian());
+ TextRulerToolkit.log("\tbase2: " + base2.getCoveringStatistics() + " --> laplacian = "
+ + base2.getLaplacian());
+ if (base2.getCoveringStatistics().getCoveredPositivesCount() > base1.getCoveringStatistics()
+ .getCoveredPositivesCount())
+ return base2;
+ else
+ return base1;
+ }
+
+ public String getResultString() {
+ if (ruleList != null)
+ return getTMFileHeaderString() + ruleList.getRulesString("");
+ else
+ return "No results available yet!";
+ }
+
+ public void setParameters(Map<String, Object> params) {
+ if (TextRulerToolkit.DEBUG)
+ saveParametersToTempFolder(params);
+
+ // TODO try catch
+ if (params.containsKey(WINDOSIZE_KEY))
+ windowSize = (Integer) params.get(WINDOSIZE_KEY);
+
+ if (params.containsKey(ERROR_THRESHOLD_KEY))
+ errorThreshold = (Float) params.get(ERROR_THRESHOLD_KEY);
+
+ if (params.containsKey(POSTAG_ROOTTYPE_KEY))
+ posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY);
+
+ }
+
+ /**
+  * Collects the terms of the whole document of {@code example}: one {@link WhiskRuleItem} per
+  * annotation of the TextMarker "any" type that passes the filter set, numbered consecutively
+  * from 0 in the order in which the annotations are returned.
+  *
+  * @param example the example whose document CAS is scanned
+  * @return the numbered terms, never {@code null}
+  */
+ public List<WhiskRuleItem> getAllTermsOfExample(TextRulerExample example) {
+   CAS cas = example.getDocumentCAS();
+   Type anyTokenType = cas.getTypeSystem().getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+   int documentEnd = cas.getDocumentText().length() + 1;
+   List<AnnotationFS> tokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, 0, documentEnd,
+       TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), anyTokenType);
+
+   List<WhiskRuleItem> terms = new ArrayList<WhiskRuleItem>();
+   for (AnnotationFS token : tokens) {
+     WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(token, example.getDocument()));
+     // the position in the result list is the term number
+     term.setTermNumberInExample(terms.size());
+     terms.add(term);
+   }
+   return terms;
+ }
+
+ /**
+  * Selects the terms whose token annotation lies completely within {@code [startPos, endPos]}.
+  * The scan stops at the first term whose token ends behind {@code endPos}.
+  *
+  * @param allTerms the terms to scan, in document order
+  * @param startPos smallest accepted begin offset
+  * @param endPos largest accepted end offset
+  * @return the selected terms in their original order, possibly empty
+  */
+ public List<WhiskRuleItem> getTermsWithinBounds(List<WhiskRuleItem> allTerms, int startPos,
+     int endPos) {
+   List<WhiskRuleItem> inside = new ArrayList<WhiskRuleItem>();
+   for (WhiskRuleItem term : allTerms) {
+     TextRulerAnnotation token = term.getWordConstraint().getTokenAnnotation();
+     if (token.getEnd() > endPos) {
+       // this token already reaches past the range, so we are done
+       break;
+     }
+     if (token.getBegin() >= startPos) {
+       inside.add(term);
+     }
+   }
+   return inside;
+ }
+
+ // TODO share this between algorithms (e.g. LP2 and RAPIER, WHISK ?) and
+ // make a maximum size of the cache, etc. like CasCache?
+ /**
+  * Provides covering statistics for all given rules. Rules whose rule string has been tested
+  * before receive a copy of the cached statistics; all other rules are tested on the example
+  * documents and their statistics are cached afterwards. Nothing is cached if the learner is
+  * aborted during testing.
+  *
+  * @param rules the rules that need covering statistics
+  */
+ protected void testRulesIfNotCached(List<TextRulerRule> rules) {
+   List<TextRulerRule> uncached = new ArrayList<TextRulerRule>();
+
+   for (TextRulerRule rule : rules) {
+     String ruleString = rule.getRuleString();
+     if (!cachedTestedRuleStatistics.containsKey(ruleString)) {
+       uncached.add(rule);
+       continue;
+     }
+     rule.setCoveringStatistics(cachedTestedRuleStatistics.get(ruleString).copy());
+     TextRulerToolkit.log("CACHE HIT !");
+   }
+
+   if (uncached.isEmpty()) {
+     return;
+   }
+   testRulesOnDocumentSet(uncached, exampleDocuments);
+   if (shouldAbort()) {
+     return;
+   }
+   for (TextRulerRule rule : uncached) {
+     cachedTestedRuleStatistics.put(rule.getRuleString(), rule.getCoveringStatistics().copy());
+   }
+ }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/Whisk.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearner;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter.MLAlgorithmParamType;
+
+/**
+ * Factory that creates {@link Whisk} learners and describes their three parameters (window
+ * size, error threshold and POS tag root type).
+ */
+public class WhiskFactory implements TextRulerLearnerFactory {
+
+  /**
+   * Creates a new {@link Whisk} learner. {@code additionalFolderPath} is not passed on to the
+   * learner.
+   */
+  public TextRulerLearner createAlgorithm(String inputFolderPath, String additionalFolderPath,
+      String preprocessorTMfile, String tempFolderPath, String[] fullSlotTypeNames,
+      Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+    Whisk learner = new Whisk(inputFolderPath, preprocessorTMfile, tempFolderPath,
+        fullSlotTypeNames, filterSet, delegate);
+    return learner;
+  }
+
+  /** Returns a new map holding the standard value of every WHISK parameter, keyed by name. */
+  public Map<String, Object> getAlgorithmParameterStandardValues() {
+    Map<String, Object> defaults = new HashMap<String, Object>();
+    defaults.put(Whisk.WINDOSIZE_KEY, Integer.valueOf(Whisk.STANDARD_WINDOWSIZE));
+    defaults.put(Whisk.ERROR_THRESHOLD_KEY, Float.valueOf(Whisk.STANDARD_ERROR_THRESHOLD));
+    defaults.put(Whisk.POSTAG_ROOTTYPE_KEY, Whisk.STANDARD_POSTAG_ROOTTYPE);
+    return defaults;
+  }
+
+  /** Returns the descriptions (key, display name, type) of the three WHISK parameters. */
+  public TextRulerLearnerParameter[] getAlgorithmParameters() {
+    return new TextRulerLearnerParameter[] {
+        new TextRulerLearnerParameter(Whisk.WINDOSIZE_KEY, "Window Size",
+            MLAlgorithmParamType.ML_INT_PARAM),
+        new TextRulerLearnerParameter(Whisk.ERROR_THRESHOLD_KEY, "Maximum Error Threshold",
+            MLAlgorithmParamType.ML_FLOAT_PARAM),
+        new TextRulerLearnerParameter(Whisk.POSTAG_ROOTTYPE_KEY, "PosTag Root Type",
+            MLAlgorithmParamType.ML_STRING_PARAM) };
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerMultiSlotRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerRulePattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerSlotPattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+
+public class WhiskRule extends TextRulerMultiSlotRule {
+
+ TextRulerExample seedExample;
+
+ public WhiskRule(WhiskRule copyFrom) {
+ super(copyFrom);
+ seedExample = copyFrom.seedExample;
+ }
+
+ public WhiskRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target,
+ TextRulerExample seedExample) {
+ super(parentAlgorithm, target);
+ this.seedExample = seedExample;
+ }
+
+ @Override
+ public WhiskRule copy() {
+ return new WhiskRule(this);
+ }
+
+ public double getLaplacian() {
+ int e = 0;
+ int n = 0;
+
+ if (coveringStatistics != null) {
+ e = coveringStatistics.getCoveredNegativesCount();
+ n = coveringStatistics.getCoveredNegativesCount()
+ + coveringStatistics.getCoveredPositivesCount();
+ }
+ return ((double) e + 1) / ((double) n + 1);
+ }
+
+ public TextRulerExample getSeedExample() {
+ return seedExample;
+ }
+
+ @Override
+ public void setCoveringStatistics(TextRulerStatisticsCollector c) {
+ super.setCoveringStatistics(c);
+ if (TextRulerToolkit.DEBUG && c != null) {
+ if (!c.getCoveredPositiveExamples().contains(seedExample)) {
+ TextRulerToolkit.log("ERROR, A WHISK RULE MUST COVER AT LEAST ITS SEED EXAMPLE!");
+ TextRulerToolkit.log("\tRULE: " + getRuleString());
+ }
+ }
+ }
+
+ public boolean containsTerm(WhiskRuleItem term) {
+ for (TextRulerSlotPattern sp : slotPatterns) {
+ for (TextRulerRuleItem i : sp.preFillerPattern)
+ if (i.equals(term))
+ return true;
+ for (TextRulerRuleItem i : sp.fillerPattern)
+ if (i.equals(term))
+ return true;
+ for (TextRulerRuleItem i : sp.postFillerPattern)
+ if (i.equals(term))
+ return true;
+ }
+ return false;
+ }
+
+ public WhiskRuleItem searchItemWithTermNumber(int no) {
+ for (TextRulerSlotPattern sp : slotPatterns) {
+ for (TextRulerRuleItem i : sp.preFillerPattern) {
+ if (((WhiskRuleItem) i).getTermNumberInExample() == no) {
+ return (WhiskRuleItem) i;
+ }
+ }
+ for (TextRulerRuleItem i : sp.fillerPattern) {
+ if (((WhiskRuleItem) i).getTermNumberInExample() == no) {
+ return (WhiskRuleItem) i;
+ }
+ }
+ for (TextRulerRuleItem i : sp.postFillerPattern) {
+ if (((WhiskRuleItem) i).getTermNumberInExample() == no) {
+ return (WhiskRuleItem) i;
+ }
+ }
+ }
+ return null;
+ }
+
+ // TODO this could be moved to the core framework (TextRulerMultiSlotRule)
+ public WhiskRuleItem searchNeighborOfItem(WhiskRuleItem item, boolean goToLeft) {
+ int slotIndex = -1;
+ int patternIndex = -1;
+ int slotI = 0;
+ for (TextRulerSlotPattern sp : slotPatterns) {
+ for (TextRulerRuleItem it : sp.preFillerPattern) {
+ if (it == item) {
+ slotIndex = slotI;
+ patternIndex = 0; // 0=preFiller
+ break;
+ }
+ }
+ if (slotIndex < 0) {
+ for (TextRulerRuleItem it : sp.fillerPattern) {
+ if (it == item) {
+ slotIndex = slotI;
+ patternIndex = 1; // 1=filler
+ break;
+ }
+ }
+ }
+ if (slotIndex < 0) {
+ for (TextRulerRuleItem it : sp.postFillerPattern) {
+ if (it == item) {
+ slotIndex = slotI;
+ patternIndex = 2; // 2=postFiller
+ break;
+ }
+ }
+ }
+ if (slotIndex >= 0) {
+ break;
+ }
+ }
+ if (slotIndex < 0) // we didn't even find the item in our rule ?! how
+ // can this happen ?
+ return null;
+
+ TextRulerRulePattern currentPattern = getPattern(slotIndex, patternIndex);
+ while (currentPattern != null) {
+ int startIndex = currentPattern.indexOf(item); // this is only >= 0
+ // for the first
+ // pattern...
+ if (!goToLeft) // walk forward...
+ {
+ int startSearchFromIndex = startIndex + 1;
+ if (startSearchFromIndex < currentPattern.size())
+ return (WhiskRuleItem) currentPattern.get(startSearchFromIndex);
+ else // skip to next pattern
+ {
+ patternIndex++;
+ if (patternIndex > 2) {
+ patternIndex = 0;
+ slotIndex++;
+ if (slotIndex >= slotPatterns.size())
+ return null; // not found!
+ }
+ currentPattern = getPattern(slotIndex, patternIndex);
+ }
+ } else {
+ int startSearchFromIndex = startIndex >= 0 ? startIndex - 1 : currentPattern.size() - 1;
+ if (startSearchFromIndex >= 0 && currentPattern.size() > 0)
+ return (WhiskRuleItem) currentPattern.get(startSearchFromIndex);
+ else // skip to previous pattern
+ {
+ patternIndex--;
+ if (patternIndex < 0) {
+ patternIndex = 2;
+ slotIndex--;
+ if (slotIndex < 0)
+ return null; // not found!
+ }
+ currentPattern = getPattern(slotIndex, patternIndex);
+ }
+ }
+ }
+ return null;
+ }
+
+ private TextRulerRulePattern getPattern(int slotIndex, int patternIndex) {
+ TextRulerSlotPattern sp = slotPatterns.get(slotIndex);
+ if (patternIndex == 0)
+ return sp.preFillerPattern;
+ else if (patternIndex == 1)
+ return sp.fillerPattern;
+ else if (patternIndex == 2)
+ return sp.postFillerPattern;
+ else
+ return null;
+ }
+
+ public int totalConstraintPoints() {
+ int result = 0;
+ for (TextRulerSlotPattern sl : slotPatterns) {
+ for (TextRulerRuleItem i : sl.preFillerPattern) {
+ result += ((WhiskRuleItem) i).constraintPoints();
+ }
+ for (TextRulerRuleItem i : sl.fillerPattern) {
+ result += ((WhiskRuleItem) i).constraintPoints();
+ }
+ for (TextRulerRuleItem i : sl.postFillerPattern) {
+ result += ((WhiskRuleItem) i).constraintPoints();
+ }
+ }
+ return result;
+ }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRule.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.cas.Type;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerWordConstraint;
+
+ /**
+  * A single item of a {@link WhiskRule}. An item may carry a word constraint, any number of
+  * additional annotation constraints, a star wildcard flag and the number of the example term it
+  * was created for.
+  */
+ public class WhiskRuleItem implements TextRulerRuleItem {
+
+   private TextRulerWordConstraint wordConstraint;
+
+   private boolean isStarWildCard = false;
+
+   private int termNumberInExample = -1;
+
+   /** If set, a regular expression word constraint is not printed as REGEXP condition. */
+   private boolean hideRegExp = false;
+
+   protected List<MLWhiskOtherConstraint> otherConstraints = new ArrayList<MLWhiskOtherConstraint>();
+
+   /**
+    * Additional constraint of an item: an annotation related to the item's token annotation. The
+    * constraint can serve as the item's anchor only if it spans exactly the same offsets as the
+    * token annotation.
+    */
+   public static class MLWhiskOtherConstraint {
+
+     TextRulerAnnotation tokenAnnotation;
+
+     TextRulerAnnotation constraintAnnotation;
+
+     boolean canBeAnchor;
+
+     Type type;
+
+     public MLWhiskOtherConstraint(TextRulerAnnotation tokenAnnotation,
+         TextRulerAnnotation constraintAnnotation) {
+       this.tokenAnnotation = tokenAnnotation;
+       this.constraintAnnotation = constraintAnnotation;
+       this.type = constraintAnnotation.getType();
+       canBeAnchor = (tokenAnnotation.getBegin() == constraintAnnotation.getBegin())
+           && (tokenAnnotation.getEnd() == constraintAnnotation.getEnd());
+       // TODO is the matching END also a requirement ?
+     }
+
+     /** @return {@code true} if token and constraint are the very same annotation object */
+     public boolean isTMBasicTypeTokenConstraint() {
+       return tokenAnnotation == constraintAnnotation;
+     }
+
+     public boolean canBeAnchorConstraint() {
+       return canBeAnchor;
+     }
+
+     /**
+      * Two constraints are equal if they have the same type short name and the same anchor
+      * capability. Any object of another class, and {@code null}, is unequal.
+      */
+     @Override
+     public boolean equals(Object o) {
+       if (this == o) {
+         return true;
+       }
+       if (!(o instanceof MLWhiskOtherConstraint)) {
+         // honour the Object.equals contract instead of failing with a ClassCastException
+         return false;
+       }
+       MLWhiskOtherConstraint co = (MLWhiskOtherConstraint) o;
+       return toString().equals(co.toString()) && (canBeAnchor == co.canBeAnchor);
+     }
+
+     @Override
+     public int hashCode() {
+       return toString().hashCode() * (canBeAnchor ? 2 : 1);
+     }
+
+     /** @return the short name of the constraint annotation's type */
+     @Override
+     public String toString() {
+       return type.getShortName();
+     }
+
+     public MLWhiskOtherConstraint copy() {
+       return new MLWhiskOtherConstraint(tokenAnnotation, constraintAnnotation);
+     }
+
+   }
+
+   /** Creates an item without word constraint and without a term number (-1). */
+   public WhiskRuleItem() {
+     super();
+     wordConstraint = null;
+     termNumberInExample = -1;
+   }
+
+   /** Creates a star wildcard item that is associated with the given term number. */
+   public static WhiskRuleItem newWildCardItem(int startTermNumber) {
+     WhiskRuleItem i = new WhiskRuleItem();
+     i.setIsStarWildCard(true);
+     i.setTermNumberInExample(startTermNumber);
+     return i;
+   }
+
+   /** Copy constructor; the word constraint and all other constraints are copied as well. */
+   public WhiskRuleItem(WhiskRuleItem copyFrom) {
+     super();
+     if (copyFrom.wordConstraint != null) {
+       wordConstraint = copyFrom.wordConstraint.copy();
+     }
+     isStarWildCard = copyFrom.isStarWildCard;
+     termNumberInExample = copyFrom.termNumberInExample;
+     hideRegExp = copyFrom.hideRegExp;
+     for (MLWhiskOtherConstraint c : copyFrom.otherConstraints) {
+       otherConstraints.add(c.copy());
+     }
+   }
+
+   /** Creates an item whose word constraint is built from the given token annotation. */
+   public WhiskRuleItem(TextRulerAnnotation tokenAnnotation) {
+     super();
+     setWordConstraint(new TextRulerWordConstraint(tokenAnnotation));
+   }
+
+   public void setWordConstraint(TextRulerWordConstraint c) {
+     wordConstraint = c;
+   }
+
+   public TextRulerWordConstraint getWordConstraint() {
+     return wordConstraint;
+   }
+
+   public TextRulerRuleItem copy() {
+     return new WhiskRuleItem(this);
+   }
+
+   /**
+    * Renders this item as part of a rule string: an anchor, optionally followed by {@code *?} for
+    * star wildcards, followed by a brace block with the conditions and - for the first item of a
+    * filler pattern - the MARKONCE action.
+    * <p>
+    * The anchor is taken from the word constraint if there is one, otherwise from the first other
+    * constraint that can be an anchor, otherwise it is {@code ALL}.
+    *
+    * @param rule
+    *          the owning rule; must be a {@link WhiskRule} or {@code null}. It is only
+    *          dereferenced for the first item of a filler pattern.
+    */
+   public String getStringForRuleString(TextRulerRule rule, MLRuleItemType type,
+       int numberInPattern, int patternSize, int numberInRule, int ruleSize, int slotIndex) {
+
+     WhiskRule whiskRule = (WhiskRule) rule;
+     boolean isMarkingItem = type == MLRuleItemType.FILLER && numberInPattern == 0;
+     List<String> constraints = new ArrayList<String>();
+
+     String anchor = null;
+
+     if (wordConstraint != null) {
+       if (wordConstraint.isRegExpConstraint()) {
+         anchor = wordConstraint.typeShortName();
+         if (!hideRegExp) {
+           constraints.add("REGEXP(\"" + wordConstraint + "\")");
+         }
+       } else {
+         anchor = wordConstraint.toString();
+       }
+     }
+
+     // without a word anchor, the first anchor-capable other constraint becomes the anchor
+     MLWhiskOtherConstraint anchorConstraint = null;
+     if (anchor == null) {
+       for (MLWhiskOtherConstraint c : otherConstraints) {
+         if (c.canBeAnchorConstraint()) {
+           anchorConstraint = c;
+           break;
+         }
+       }
+     }
+
+     // all remaining other constraints are printed as conditions
+     for (MLWhiskOtherConstraint oc : otherConstraints) {
+       if (oc != anchorConstraint) {
+         if (oc.canBeAnchorConstraint()) {
+           constraints.add("IS(" + oc + ")");
+         } else {
+           constraints.add("PARTOF(" + oc + ")");
+         }
+       }
+     }
+     if (anchor == null) {
+       if (anchorConstraint != null) {
+         anchor = anchorConstraint.toString();
+       } else {
+         anchor = "ALL";
+       }
+     }
+
+     StringBuilder result = new StringBuilder();
+     if (constraints.size() > 0) {
+       result.append("{");
+       for (int i = 0; i < constraints.size(); i++) {
+         if (i > 0) {
+           result.append(", ");
+         }
+         result.append(constraints.get(i));
+       }
+       // a marking item keeps the brace open for the action part
+       if (!isMarkingItem) {
+         result.append("}");
+       }
+     }
+
+     if (isMarkingItem) {
+       if (constraints.size() == 0) {
+         result.append("{");
+       }
+       result.append("->MARKONCE(").append(whiskRule.getMarkName(slotIndex));
+       if (patternSize > 1) {
+         result.append(", ").append(numberInRule + 1).append(", ")
+             .append(numberInRule + patternSize);
+       }
+       result.append(")}");
+     }
+     if (isStarWildCard) {
+       anchor += "*?";
+     }
+     return anchor + result;
+   }
+
+   public void setIsStarWildCard(boolean flag) {
+     isStarWildCard = flag;
+   }
+
+   public boolean isStarWildCard() {
+     return isStarWildCard;
+   }
+
+   public void setTermNumberInExample(int i) {
+     termNumberInExample = i;
+   }
+
+   public int getTermNumberInExample() {
+     return termNumberInExample;
+   }
+
+   /**
+    * Compares the word constraint (only if this item has one), the wildcard flag and the term
+    * number. Other constraints and the hide-regexp flag are not compared. An argument that is not
+    * a {@code WhiskRuleItem} (including {@code null}) is unequal.
+    * <p>
+    * NOTE(review): the comparison is one-sided. An item without word constraint can equal an item
+    * with one, but not vice versa. Left unchanged because callers may depend on it; confirm
+    * whether that is intended.
+    */
+   public boolean equals(TextRulerRuleItem o) {
+     if (!(o instanceof WhiskRuleItem)) {
+       return false;
+     }
+     WhiskRuleItem it = (WhiskRuleItem) o;
+     if (wordConstraint != null) {
+       if (!wordConstraint.equals(it.wordConstraint)) {
+         return false;
+       }
+     }
+
+     return isStarWildCard == it.isStarWildCard && termNumberInExample == it.termNumberInExample;
+   }
+
+   /** @return the rule string of this item rendered outside of any rule context */
+   @Override
+   public String toString() {
+     return getStringForRuleString(null, null, 0, 0, 0, 0, 0);
+   }
+
+   public void setHideRegExp(boolean flag) {
+     hideRegExp = flag;
+   }
+
+   /** Adds the constraint unless an equal one is already present. */
+   public void addOtherConstraint(MLWhiskOtherConstraint c) {
+     if (!otherConstraints.contains(c)) {
+       otherConstraints.add(c);
+     }
+   }
+
+   /** @return the live list of other constraints (not a copy) */
+   public List<MLWhiskOtherConstraint> getOtherConstraints() {
+     return otherConstraints;
+   }
+
+   /**
+    * Scores how constrained this item is: a word constraint counts 3 points, or 1 point if
+    * {@code hideRegExp} is set, and every other constraint counts 1 point.
+    */
+   public int constraintPoints() {
+     int result = 0;
+     if (wordConstraint != null) {
+       // a regexp constraint is less general so point it bad here!
+       result += hideRegExp ? 1 : 3;
+     }
+     result += otherConstraints.size();
+     return result;
+   }
+
+ }
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskRuleItem.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.textruler.learner.whisk.token;
+
+import java.util.ArrayList;
+import java.util.Map;
+
+import org.apache.uima.textmarker.textruler.TextRulerPlugin;
+import org.apache.uima.textmarker.textruler.extension.TextRulerController;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerController;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter;
+import org.eclipse.jface.preference.BooleanFieldEditor;
+import org.eclipse.jface.preference.FieldEditor;
+import org.eclipse.jface.preference.IPreferenceStore;
+import org.eclipse.jface.preference.PreferencePage;
+import org.eclipse.jface.preference.StringFieldEditor;
+import org.eclipse.swt.SWT;
+import org.eclipse.swt.layout.GridData;
+import org.eclipse.swt.layout.GridLayout;
+import org.eclipse.swt.widgets.Composite;
+import org.eclipse.swt.widgets.Control;
+import org.eclipse.ui.IWorkbench;
+import org.eclipse.ui.IWorkbenchPreferencePage;
+
+ /**
+  * Preference page for the token based WHISK learner. It creates one field editor per algorithm
+  * parameter reported by the learner's factory and persists the values in the TextRuler plugin's
+  * preference store.
+  */
+ public class WhiskTokenPreferencePage extends PreferencePage implements IWorkbenchPreferencePage {
+
+   public static String ID = "org.apache.uima.textmarker.textruler.algorithmPages";
+
+   private TextRulerLearnerController algorithmController;
+
+   private IPreferenceStore store;
+
+   /** All editors created by {@link #createContents(Composite)}. */
+   private ArrayList<FieldEditor> fields = new ArrayList<FieldEditor>();
+
+   /** Looks up the controller of the WHISK (token) learner and installs the plugin's store. */
+   public WhiskTokenPreferencePage() {
+     this.algorithmController = TextRulerController
+         .getControllerForID("org.apache.uima.textmarker.textruler.whisk.token");
+     this.store = TextRulerPlugin.getDefault().getPreferenceStore();
+     setPreferenceStore(this.store);
+   }
+
+   /**
+    * Builds the page: boolean parameters get a check box, float, int and string parameters get a
+    * text field, select parameters get no editor. The factory's standard value of a parameter is
+    * registered as the default of its preference key ({@code <controller id>.<parameter id>}).
+    */
+   @Override
+   protected Control createContents(Composite parent) {
+     Composite top = new Composite(parent, SWT.LEFT);
+     top.setLayoutData(new GridData(GridData.FILL_HORIZONTAL));
+     top.setLayout(new GridLayout());
+
+     TextRulerLearnerFactory factory = algorithmController.getFactory();
+     TextRulerLearnerParameter[] params = factory.getAlgorithmParameters();
+     Map<String, Object> standardValues = factory.getAlgorithmParameterStandardValues();
+     if (params == null) {
+       return top;
+     }
+
+     for (TextRulerLearnerParameter param : params) {
+       String key = algorithmController.getID() + "." + param.id;
+       switch (param.type) {
+         case ML_BOOL_PARAM: {
+           FieldEditor editor = new BooleanFieldEditor(key, param.name, top);
+           fields.add(editor);
+           store.setDefault(key, (Boolean) standardValues.get(param.id));
+           attachAndLoad(editor);
+           break;
+         }
+
+         case ML_FLOAT_PARAM:
+         case ML_INT_PARAM:
+         case ML_STRING_PARAM: {
+           FieldEditor editor = new StringFieldEditor(key, param.name, top);
+           fields.add(editor);
+           store.setDefault(key, standardValues.get(param.id).toString());
+           attachAndLoad(editor);
+           break;
+         }
+         case ML_SELECT_PARAM:
+           break;
+       }
+     }
+     return top;
+   }
+
+   /** Connects the editor to the preference store and loads its current value. */
+   private void attachAndLoad(FieldEditor editor) {
+     editor.setPreferenceStore(store);
+     editor.load();
+   }
+
+   @Override
+   public void init(IWorkbench workbench) {
+   }
+
+   /** Resets every editor to its default value; the super implementation is not called. */
+   @Override
+   protected void performDefaults() {
+     for (FieldEditor editor : fields) {
+       editor.loadDefault();
+     }
+     // super.performDefaults();
+   }
+
+   /** Stores the value of every editor; the super implementation is not called. */
+   @Override
+   public boolean performOk() {
+     for (FieldEditor editor : fields) {
+       editor.store();
+     }
+     // return super.performOk();
+     return true;
+   }
+ }
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/whisk/token/WhiskTokenPreferencePage.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,730 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.wien;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerExampleDocument;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerRulePattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerSlotPattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+
+public class Wien extends TextRulerBasicLearner {
+
+ TextRulerRulePattern hPattern;
+
+ TextRulerRulePattern tPattern;
+
+ Map<String, PatternPair> headTailCache = new HashMap<String, PatternPair>();
+
+ Map<String, List<TextRulerRulePattern>> interTupelSeparatorsCache = new HashMap<String, List<TextRulerRulePattern>>();
+
+ public static class PatternPair {
+ public TextRulerRulePattern l = new TextRulerRulePattern();
+
+ public TextRulerRulePattern r = new TextRulerRulePattern();
+ }
+
+ ArrayList<PatternPair> patternPairs = new ArrayList<PatternPair>();
+
+ WienRule theRule;
+
+ /** Possible outcomes of testing a head / tail / left1 candidate triple against constraint 3. */
+ public enum constraint3ReturnType {
+   C3_SUCCESS,
+   C3_L1CandidateSuffixError,
+   C3_TailCandidateH_L1Error,
+   C3_TailCandidateRK_PrefixError,
+   C3_TailCandidateNotFoundError,
+   C3_TailCandidateSucceedsL1InTailError,
+   C3_L1CandidateInterTupleSeparatorSuffixError,
+   C3_TailCandidatePrecedesL1InterTupleSeparatorError
+ }
+
+ /**
+  * Creates the learner; all arguments are handed unchanged to the
+  * {@link TextRulerBasicLearner} constructor.
+  */
+ public Wien(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
+ Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+ super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, delegate);
+ }
+
+ /**
+  * Always returns {@code false} for WIEN.
+  * NOTE(review): presumably tells the base learner not to collect covered negative instances
+  * while testing rules - confirm against TextRulerBasicLearner.
+  */
+ @Override
+ public boolean collectNegativeCoveredInstancesWhenTesting() {
+ return false;
+ }
+
+ /**
+  * Runs the WIEN learning process: clears the caches, adds one {@link PatternPair} per slot,
+  * creates the multi-slot examples and then searches for the right patterns, the left patterns
+  * and the head / tail / left1 patterns. If all three searches succeed, the learned patterns are
+  * assembled into {@link #theRule}. Progress and the final state are reported to the delegate;
+  * any exception is printed and reported as {@code ML_ERROR}.
+  */
+ @Override
+ protected void doRun() {
+ TextRulerToolkit.log("-- WIEN START");
+
+ headTailCache.clear();
+ interTupelSeparatorsCache.clear();
+
+ // one (initially empty) left/right pattern pair per slot
+ for (int i = 0; i < slotNames.length; i++)
+ patternPairs.add(new PatternPair());
+
+ TextRulerTarget target = new TextRulerTarget(slotNames, this);
+
+ exampleDocuments.createExamplesForTarget(target); // new multislot
+ // target examples
+
+ for (TextRulerExample e : exampleDocuments.getAllPositiveExamples()) {
+ TextRulerToolkit.log("Example found: " + e);
+ }
+
+ try {
+ // all three searches are always executed, even if an earlier one has already failed
+ boolean allOk = true;
+ sendStatusUpdateToDelegate("Searching for right patterns...",
+ TextRulerLearnerState.ML_RUNNING, false);
+ if (!findRightPatterns())
+ allOk = false;
+ sendStatusUpdateToDelegate("Searching for left patterns...",
+ TextRulerLearnerState.ML_RUNNING, false);
+ if (!findLeftPatterns())
+ allOk = false;
+ sendStatusUpdateToDelegate("Searching for head, tail and left1 patterns...",
+ TextRulerLearnerState.ML_RUNNING, false);
+ if (!findHeadTailAndL1Patterns())
+ allOk = false;
+ // {
+ // String s = "";
+ // for (TextRulerRuleItem i : hPattern)
+ // s += " "+i;
+ // s += " ||||";
+ // for (TextRulerRuleItem i : patternPairs.get(0).l)
+ // s += " "+i;
+ // s += " ||||";
+ // for (TextRulerRuleItem i : tPattern)
+ // s += " "+i;
+ // TextRulerToolkit.log(s);
+ // }
+
+ if (allOk) {
+ sendStatusUpdateToDelegate("Building multi-slot rule.", TextRulerLearnerState.ML_RUNNING,
+ false);
+ theRule = new WienRule(this, target);
+ List<TextRulerSlotPattern> rPatterns = theRule.getPatterns();
+ // running count of the rule items added so far; used as end index of the final MARK action
+ int totalItemCount = 0;
+ for (int k = 0; k < slotNames.length; k++) {
+ WienRuleItem slotItem = new WienRuleItem((TextRulerAnnotation) null);
+ TextRulerSlotPattern rP = new TextRulerSlotPattern();
+ rPatterns.add(rP);
+ PatternPair p = patternPairs.get(k);
+ // the left pattern of slot k becomes its pre-filler pattern
+ for (int i = 0; i < p.l.size(); i++) {
+ WienRuleItem item = (WienRuleItem) p.l.get(i);
+ if (k == 0 && i == 0) // the very first rule item:
+ {
+ // work on a copy so that the learned pattern item itself is not modified
+ item = item.copy();
+ // old version:
+ // item.addCondition("-NEAR,wien_tail,10000000,false");
+ item.addCondition("-AFTER(wien_tail)");
+ item.addCondition("-PARTOF(wien_rulemark)");
+ }
+ rP.preFillerPattern.add(item);
+ totalItemCount++;
+ }
+ // the slot itself is a single filler item
+ rP.fillerPattern.add(slotItem.copy());
+ totalItemCount++;
+ // the right pattern of slot k becomes its post-filler pattern
+ for (int i = 0; i < p.r.size(); i++) {
+ WienRuleItem item = (WienRuleItem) p.r.get(i);
+ totalItemCount++;
+ if (k == slotNames.length - 1 && i == p.r.size() - 1) // the
+ // very
+ // last
+ // item
+ {
+ item = item.copy();
+ item.addAction("MARK(wien_rulemark, 1, " + totalItemCount + ")");
+ }
+ rP.postFillerPattern.add(item);
+ }
+ totalItemCount++; // the inter-slot ALL*? item has to be
+ // counted as well!
+ }
+ sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
+ } else
+ sendStatusUpdateToDelegate("Done - Not all patterns could be learned!",
+ TextRulerLearnerState.ML_DONE, true);
+ } catch (Exception e) {
+ e.printStackTrace();
+ sendStatusUpdateToDelegate("Aborted due to Exception!", TextRulerLearnerState.ML_ERROR, true);
+ }
+ // the caches are cleared after every run, whether it succeeded or not
+ headTailCache.clear();
+ interTupelSeparatorsCache.clear();
+ TextRulerToolkit.log("-- WIEN END");
+ }
+
+ /**
+  * Learns the right delimiter pattern of every slot. For slot {@code k} the candidates are the
+  * prefixes (length 1 up to the size of the shortest right context) of the first right context
+  * found in the first example document; the shortest prefix accepted by
+  * {@code testConstraint1} is stored as {@code patternPairs.get(k).r}.
+  *
+  * @return {@code true} if a right pattern was found for every slot; a slot without any right
+  *         context counts as not found
+  */
+ protected boolean findRightPatterns() {
+   TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
+   boolean allFound = true;
+   for (int k = 0; k < slotNames.length; k++) {
+     List<TextRulerRulePattern> rightContexts = getRightContextForSlot(doc, k);
+     if (rightContexts == null || rightContexts.isEmpty()) {
+       // nothing to derive a pattern from; report failure instead of failing on get(0)
+       allFound = false;
+       continue;
+     }
+     int shortest = Integer.MAX_VALUE;
+     for (TextRulerRulePattern p : rightContexts) {
+       shortest = Math.min(shortest, p.size());
+     }
+     TextRulerRulePattern sourcePattern = rightContexts.get(0);
+     boolean found = false;
+     for (int len = 1; len <= shortest; len++) {
+       // get prefix:
+       TextRulerRulePattern subPattern = sourcePattern.subPattern(0, len);
+       if (testConstraint1(subPattern, k)) {
+         // link mark-up generalization is intentionally not enabled for right patterns
+         patternPairs.get(k).r = subPattern;
+         TextRulerToolkit.log("right " + k + ": " + subPattern);
+         found = true;
+         break;
+       }
+     }
+     if (!found) {
+       allFound = false;
+     }
+   }
+   return allFound;
+ }
+
+ /**
+  * Learns the left delimiter pattern of every slot except the first one (index 0 is skipped
+  * here). For slot {@code k} the candidates are the suffixes (length 1 up to the size of the
+  * shortest left context) of the first left context found in the first example document; the
+  * shortest suffix accepted by {@code testConstraint2} is stored as
+  * {@code patternPairs.get(k).l} and its items are switched to generalized link mark-up.
+  *
+  * @return {@code true} if a left pattern was found for every slot from index 1 on (trivially
+  *         {@code true} for fewer than two slots); a slot without any left context counts as
+  *         not found
+  */
+ protected boolean findLeftPatterns() {
+   TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
+   // skip l 0 !
+   if (slotNames.length < 2) {
+     return true;
+   }
+   boolean allFound = true;
+   for (int k = 1; k < slotNames.length; k++) {
+     List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
+     if (leftContexts == null || leftContexts.isEmpty()) {
+       // nothing to derive a pattern from; report failure instead of failing on get(0)
+       allFound = false;
+       continue;
+     }
+     int shortest = Integer.MAX_VALUE;
+     for (TextRulerRulePattern p : leftContexts) {
+       shortest = Math.min(shortest, p.size());
+     }
+     TextRulerRulePattern sourcePattern = leftContexts.get(0);
+     boolean found = false;
+     for (int len = 1; len <= shortest; len++) {
+       // get suffix:
+       TextRulerRulePattern subPattern = sourcePattern.subPattern(sourcePattern.size() - len, len);
+       if (testConstraint2(subPattern, k)) {
+         patternPairs.get(k).l = subPattern;
+         for (TextRulerRuleItem i : subPattern) {
+           ((WienRuleItem) i).getWordConstraint().setGeneralizeLinkMarkUp(true);
+         }
+         TextRulerToolkit.log("left " + k + ": " + subPattern);
+         found = true;
+         break;
+       }
+     }
+     if (!found) {
+       allFound = false;
+     }
+   }
+   return allFound;
+ }
+
+ protected boolean findHeadTailAndL1Patterns() {
+ List<TextRulerExampleDocument> docs = exampleDocuments.getDocuments();
+ TextRulerExampleDocument doc0 = docs.get(0);
+ TextRulerRulePattern head = new TextRulerRulePattern();
+ TextRulerRulePattern tail = new TextRulerRulePattern();
+ getPageHeadAndTailPortion(doc0, head, tail);
+
+ final class HLCandidate {
+ public TextRulerRulePattern head = new TextRulerRulePattern();
+
+ public TextRulerRulePattern l1 = new TextRulerRulePattern();
+ }
+
+ // a small optimization:
+ // find out the maximum possible length for l1 in doc0 since l1 is much
+ // smaller than the possible head length!
+ List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc0);
+ int shortestL1 = head.size() - 1;
+ for (TextRulerRulePattern its : interTupleSeparators)
+ shortestL1 = its.size() < shortestL1 ? its.size() : shortestL1;
+
+ List<HLCandidate> hlCandidates = new ArrayList<HLCandidate>();
+ // create candidates for each separation of the head and tail patterns:
+ for (int separator = head.size() - 1; separator > 0; separator--) {
+ HLCandidate c = new HLCandidate();
+ for (int i = 0; i < head.size(); i++) {
+ if (i < separator)
+ c.head.add(head.get(i));
+ else {
+ WienRuleItem it = (WienRuleItem) head.get(i).copy();
+ it.getWordConstraint().setGeneralizeLinkMarkUp(true);
+ c.l1.add(it);
+ }
+ }
+ hlCandidates.add(c);
+ TextRulerToolkit.log(c.head.size() + " vs. " + c.l1.size());
+ if (c.l1.size() >= shortestL1)
+ break;
+ }
+
+ long total = 0;
+
+ // get total h l1 t combination count:
+ long tCand = (tail.size() * (tail.size() + 1)) / 2;
+ for (HLCandidate c : hlCandidates) {
+ total += ((c.head.size() - 1) * (c.head.size())) / 2;
+ }
+ total *= tCand;
+
+ long current = 0;
+ int oldPercent = -1;
+
+ for (HLCandidate c : hlCandidates) {
+ // for each "candidate" which represents a l1 suffix pattern of the
+ // head tokens and a rest pattern for the h pattern,
+ // we have to create every sub pattern of the remaining h pattern as
+ // a h candidate:
+ TextRulerRulePattern l1 = c.l1;
+ TextRulerRulePattern h = null;
+
+ boolean l1Sucks = false;
+
+ for (int endI = c.head.size() - 1; endI > 0; endI--) {
+ for (int startI = endI; startI > 0; startI--) {
+ h = new TextRulerRulePattern();
+ for (int i = startI; i <= endI; i++)
+ h.add(c.head.get(i));
+
+ // now for each h candidate we have to create each t
+ // candidate:
+ TextRulerRulePattern t = null;
+ for (int tstartI = 0; tstartI < tail.size(); tstartI++) {
+ for (int tendI = tstartI; tendI < tail.size(); tendI++) {
+ int percent = Math.round(((float) current * 100 / total));
+ if (percent != oldPercent) {
+ oldPercent = percent;
+ if (percent > 100)
+ percent = 100;
+ // TextRulerToolkit.log(current+" / "+total);
+ sendStatusUpdateToDelegate("Testing C3, " + percent + "%",
+ TextRulerLearnerState.ML_RUNNING, false);
+ }
+ if (shouldAbort())
+ return false;
+ current++;
+
+ t = new TextRulerRulePattern();
+ for (int i = tstartI; i <= tendI; i++)
+ t.add(tail.get(i));
+
+ // no we have a possible candidate triple: h, t and
+ // l1:
+
+ constraint3ReturnType c3Result = testConstraint3(h, t, l1);
+
+ if (c3Result == constraint3ReturnType.C3_SUCCESS) {
+ hPattern = h;
+ tPattern = t;
+ patternPairs.get(0).l = l1;
+ return true;
+ } else if (c3Result == constraint3ReturnType.C3_L1CandidateSuffixError
+ || c3Result == constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError) {
+ l1Sucks = true;
+ current += tail.size() - tendI - 1;
+ break;
+ } else if (c3Result == constraint3ReturnType.C3_TailCandidateH_L1Error
+ || c3Result == constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError) {
+ // no special pruning options here... we simply
+ // have to test the next t-candidate
+ } else if (c3Result == constraint3ReturnType.C3_TailCandidateRK_PrefixError
+ || c3Result == constraint3ReturnType.C3_TailCandidateNotFoundError) {
+ // all candidates with the same start item are
+ // bad, so leave this inner loop:
+ current += tail.size() - tendI - 1;
+ break;
+ } else if (c3Result == constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError) {
+ // this is a problematic case... the cause could
+ // be L1 or the current Tail pattern,
+ // so we can't do nothing about it! just try the
+ // next t-candidate
+ }
+ }
+ if (l1Sucks) {
+ current += (tail.size() - tstartI - 1) * (tail.size() - tstartI) / 2;
+ break;
+ }
+ }
+ if (l1Sucks) {
+ if (startI > 0)
+ current += (startI - 1) * tCand;
+ break;
+ }
+ }
+ if (l1Sucks) {
+ current += (endI * (endI + 1) / 2) * tCand;
+ break;
+ }
+ }
+ }
+ return false;
+ }
+
+ protected void getPageHeadAndTailPortion(TextRulerExampleDocument doc, TextRulerRulePattern head,
+ TextRulerRulePattern tail) {
+ String key = doc.getCasFileName();
+ if (headTailCache.containsKey(key)) {
+ PatternPair p = headTailCache.get(key);
+ head.addAll(p.l);
+ tail.addAll(p.r);
+ } else {
+ CAS cas = doc.getCAS();
+ TextRulerExample firstExample = doc.getPositiveExamples().get(0);
+ TextRulerExample lastExample = doc.getPositiveExamples().get(
+ doc.getPositiveExamples().size() - 1);
+ TypeSystem ts = cas.getTypeSystem();
+ Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+ List<AnnotationFS> headTokens = TextRulerToolkit.getAnnotationsBeforePosition(cas,
+ firstExample.getAnnotations()[0].getBegin(), 0, TextRulerToolkit
+ .getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+ TextRulerAnnotation[] lastExampleAnnotations = lastExample.getAnnotations();
+ List<AnnotationFS> tailTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas,
+ lastExampleAnnotations[lastExampleAnnotations.length - 1].getEnd(), 0,
+ TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+ for (AnnotationFS afs : headTokens)
+ head.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+ for (AnnotationFS afs : tailTokens)
+ tail.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+ PatternPair p = new PatternPair();
+ p.l.addAll(head);
+ p.r.addAll(tail);
+ headTailCache.put(key, p);
+ }
+ }
+
+ protected List<TextRulerRulePattern> getInterTupleSepatators(TextRulerExampleDocument doc) {
+ String key = doc.getCasFileName();
+ if (interTupelSeparatorsCache.containsKey(key)) {
+ return interTupelSeparatorsCache.get(key);
+ } else {
+ List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+ CAS cas = doc.getCAS();
+ TypeSystem ts = cas.getTypeSystem();
+ Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+ List<TextRulerExample> examples = doc.getPositiveExamples();
+ for (int i = 0; i < examples.size() - 1; i++) {
+ // get separator between i'th and (i+1)'th example:
+ TextRulerAnnotation[] exampleAnnotations1 = examples.get(i).getAnnotations();
+ TextRulerAnnotation[] exampleAnnotations2 = examples.get(i + 1).getAnnotations();
+ TextRulerAnnotation lastOf1 = exampleAnnotations1[exampleAnnotations1.length - 1];
+ TextRulerAnnotation firstOf2 = exampleAnnotations2[0];
+ List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, lastOf1
+ .getEnd(), firstOf2.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
+ slotNames, filterSet), tokenType);
+ TextRulerRulePattern thePattern = new TextRulerRulePattern();
+ for (AnnotationFS afs : theTokens)
+ thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+ if (thePattern.size() > 0)
+ result.add(thePattern);
+
+ }
+ interTupelSeparatorsCache.put(key, result);
+ return result;
+ }
+ }
+
+ protected List<TextRulerRulePattern> getRightContextForSlot(TextRulerExampleDocument doc,
+ int slotIndex) {
+ List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+ CAS cas = doc.getCAS();
+ TypeSystem ts = cas.getTypeSystem();
+ Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+ List<TextRulerExample> examples = doc.getPositiveExamples();
+ boolean isLastSlot = slotIndex >= slotNames.length - 1;
+ for (int ei = 0; ei < examples.size(); ei++) {
+ boolean isLastExample = ei == examples.size() - 1;
+ TextRulerExample e = examples.get(ei);
+ // get stuff between slot slotIndex and slotIndex+1
+ TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
+ TextRulerAnnotation nextSlotAnnotation;
+
+ if (!isLastSlot)
+ nextSlotAnnotation = e.getAnnotations()[slotIndex + 1];
+ else {
+ if (!isLastExample) // the next slot annotation is the first
+ // example annotation of the next template:
+ nextSlotAnnotation = examples.get(ei + 1).getAnnotations()[0];
+ else
+ nextSlotAnnotation = null;
+ }
+
+ List<AnnotationFS> theTokens;
+ if (nextSlotAnnotation == null)
+ theTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas, slotAnnotation.getEnd(), 0,
+ TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+ else
+ theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, slotAnnotation.getEnd(),
+ nextSlotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
+ slotNames, filterSet), tokenType);
+ TextRulerRulePattern thePattern = new TextRulerRulePattern();
+ for (AnnotationFS afs : theTokens)
+ thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
+ if (thePattern.size() > 0)
+ result.add(thePattern);
+ }
+ return result;
+ }
+
+ /**
+  * Collects, for every positive example of {@code doc}, the tokens between the slot
+  * {@code slotIndex - 1} and the slot {@code slotIndex} of the same example. Empty contexts are
+  * dropped.
+  * <p>
+  * Slot 0 has no preceding slot inside its own example and is not handled here: the method
+  * returns {@code null} for it, so callers must not iterate the result without checking.
+  *
+  * @param doc
+  *          the example document to analyse
+  * @param slotIndex
+  *          index of the slot whose left context is wanted
+  * @return the non-empty left-context patterns in example order, or {@code null} if
+  *         {@code slotIndex} is 0
+  */
+ protected List<TextRulerRulePattern> getLeftContextForSlot(TextRulerExampleDocument doc,
+     int slotIndex) {
+   if (slotIndex == 0) {
+     return null;
+   }
+   List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+   CAS cas = doc.getCAS();
+   TypeSystem ts = cas.getTypeSystem();
+   Type tokenType = ts.getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+   List<TextRulerExample> examples = doc.getPositiveExamples();
+
+   // The former "first slot" handling (previous example's last slot / tokens before the slot)
+   // could never run because slot 0 is rejected above, so only the in-example case remains.
+   for (TextRulerExample e : examples) {
+     // get stuff between slot slotIndex-1 and slotIndex
+     TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
+     TextRulerAnnotation prevSlotAnnotation = e.getAnnotations()[slotIndex - 1];
+
+     List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas,
+         prevSlotAnnotation.getEnd(), slotAnnotation.getBegin(),
+         TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
+     TextRulerRulePattern thePattern = new TextRulerRulePattern();
+     for (AnnotationFS afs : theTokens) {
+       // NOTE(review): the second constructor argument (true) is unchanged from the original;
+       // presumably it switches on link mark-up generalisation - confirm against WienRuleItem
+       thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc), true));
+     }
+     if (thePattern.size() > 0) {
+       result.add(thePattern);
+     }
+   }
+   return result;
+ }
+
+ /**
+  * Collects, for every positive example of {@code doc}, the tokens lying within the bounds of the
+  * slot {@code slotIndex}, i.e. the slot's own content. Empty patterns are dropped.
+  *
+  * @param doc
+  *          the example document to analyse
+  * @param slotIndex
+  *          index of the slot whose filler is wanted
+  * @return the non-empty slot filler patterns, in example order
+  */
+ protected List<TextRulerRulePattern> getSlotFillerPatterns(TextRulerExampleDocument doc,
+     int slotIndex) {
+   List<TextRulerRulePattern> fillers = new ArrayList<TextRulerRulePattern>();
+   CAS cas = doc.getCAS();
+   Type allTokensType = cas.getTypeSystem().getType(TextRulerToolkit.TM_ALL_TYPE_NAME);
+   List<TextRulerExample> examples = doc.getPositiveExamples();
+   for (int idx = 0; idx < examples.size(); idx++) {
+     TextRulerAnnotation slot = examples.get(idx).getAnnotations()[slotIndex];
+     List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas,
+         slot.getBegin(), slot.getEnd(),
+         TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), allTokensType);
+
+     TextRulerRulePattern filler = new TextRulerRulePattern();
+     for (AnnotationFS token : slotTokens) {
+       filler.add(new WienRuleItem(new TextRulerAnnotation(token, doc)));
+     }
+     if (filler.size() > 0) {
+       fillers.add(filler);
+     }
+   }
+   return fillers;
+ }
+
+ /**
+  * Runs the per-document constraint 3 check for the candidate triple on every example document
+  * and stops at the first document that does not report success.
+  *
+  * @param h
+  *          head delimiter candidate
+  * @param t
+  *          tail delimiter candidate
+  * @param l1
+  *          candidate for the left delimiter of the first slot
+  * @return {@code C3_SUCCESS} if all documents pass, otherwise the first failure code
+  */
+ protected constraint3ReturnType testConstraint3(TextRulerRulePattern h, TextRulerRulePattern t,
+     TextRulerRulePattern l1) {
+   constraint3ReturnType outcome = constraint3ReturnType.C3_SUCCESS;
+   for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
+     outcome = testConstraint3(doc, h, t, l1);
+     if (outcome != constraint3ReturnType.C3_SUCCESS) {
+       break;
+     }
+   }
+   return outcome;
+ }
+
+ /**
+  * Checks the right delimiter candidate {@code rk} of slot {@code k} against one document:
+  * {@code rk} must be found at position 0 of every right context of the slot, and must not be
+  * found anywhere inside any filler of the slot.
+  *
+  * @param doc
+  *          the example document to test against
+  * @param rk
+  *          right delimiter candidate
+  * @param k
+  *          slot index
+  * @return {@code true} if both conditions hold for all examples of the document
+  */
+ protected boolean testConstraint1(TextRulerExampleDocument doc, TextRulerRulePattern rk, int k) {
+   // rk has to start each right context ...
+   for (TextRulerRulePattern rightContext : getRightContextForSlot(doc, k)) {
+     boolean startsWithRk = rightContext.find(rk) == 0;
+     if (!startsWithRk) {
+       return false;
+     }
+   }
+   // ... and must not show up inside the slot content itself
+   for (TextRulerRulePattern filler : getSlotFillerPatterns(doc, k)) {
+     boolean containsRk = filler.find(rk) >= 0;
+     if (containsRk) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ /**
+  * Checks the right delimiter candidate {@code rk} of slot {@code k} against all example
+  * documents, stopping at the first document that fails.
+  *
+  * @param rk
+  *          right delimiter candidate
+  * @param k
+  *          slot index
+  * @return {@code true} if every document passes the per-document check
+  */
+ protected boolean testConstraint1(TextRulerRulePattern rk, int k) {
+   boolean allPass = true;
+   for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
+     if (!testConstraint1(doc, rk, k)) {
+       allPass = false;
+       break;
+     }
+   }
+   return allPass;
+ }
+
+ /**
+  * Checks the left delimiter candidate {@code lk} of slot {@code k} against one document:
+  * {@code lk} must be found in every left context of the slot exactly at the position that makes
+  * it the context's suffix.
+  * <p>
+  * Must only be called with {@code k >= 1}: {@code getLeftContextForSlot} returns {@code null}
+  * for slot 0, which is not checked here.
+  *
+  * @param doc
+  *          the example document to test against
+  * @param lk
+  *          left delimiter candidate
+  * @param k
+  *          slot index
+  * @return {@code true} if {@code lk} ends every left context of the slot
+  */
+ protected boolean testConstraint2(TextRulerExampleDocument doc, TextRulerRulePattern lk, int k) {
+   List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
+   for (TextRulerRulePattern leftContext : leftContexts) {
+     // position at which lk would have to start in order to be a suffix
+     int suffixStart = leftContext.size() - lk.size();
+     if (suffixStart < 0) {
+       // context is shorter than the candidate
+       return false;
+     }
+     // suffixStart is non-negative here, so a negative (not found) result fails this test too
+     if (leftContext.find(lk) != suffixStart) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ /**
+  * Checks the left delimiter candidate {@code lk} of slot {@code k} against all example
+  * documents, stopping at the first document that fails.
+  *
+  * @param lk
+  *          left delimiter candidate
+  * @param k
+  *          slot index
+  * @return {@code true} if every document passes the per-document check
+  */
+ protected boolean testConstraint2(TextRulerRulePattern lk, int k) {
+   boolean allPass = true;
+   for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
+     if (!testConstraint2(doc, lk, k)) {
+       allPass = false;
+       break;
+     }
+   }
+   return allPass;
+ }
+
+ protected constraint3ReturnType testConstraint3(TextRulerExampleDocument doc,
+ TextRulerRulePattern h, TextRulerRulePattern t, TextRulerRulePattern l1) { /*
+  * Tests the candidate triple (h, t, l1) against a single document and reports the first
+  * violated condition as a constraint3ReturnType; C3_SUCCESS is returned when none is violated.
+  * The checks run in this order:
+  *   1  - l1 must end the head portion that follows h (C3_L1CandidateSuffixError)
+  *   2  - t must not occur between h and l1 (C3_TailCandidateH_L1Error)
+  *   2a - t must not start with the right delimiter of the last slot
+  *        (C3_TailCandidateRK_PrefixError)
+  *      - t must occur in the page's tail (C3_TailCandidateNotFoundError)
+  *   3  - l1 must not occur before t in the page's tail
+  *        (C3_TailCandidateSucceedsL1InTailError)
+  *   4  - l1 must end every inter-tuple separator
+  *        (C3_L1CandidateInterTupleSeparatorSuffixError)
+  *   5  - t must not occur before l1 in any inter-tuple separator
+  *        (C3_TailCandidatePrecedesL1InterTupleSeparatorError)
+  * l1 is matched against copies of the head rest and of the separators whose word constraints
+  * have setGeneralizeLinkMarkUp(true) applied.
+  */
+ final boolean logReasons = false; // constant switch: the logIf calls below are disabled
+
+ TextRulerRulePattern head = new TextRulerRulePattern();
+ TextRulerRulePattern tail = new TextRulerRulePattern();
+
+ getPageHeadAndTailPortion(doc, head, tail);
+
+ // 1: l1 must be a proper suffix of the portion between the end of h and
+ // the first slot filler:
+ // (head / h) / l1 = l1
+
+ int hPos = head.find(h); // NOTE(review): hPos is used as an offset below without a check for a negative (presumably "not found") result - confirm h is guaranteed to occur in every document's head
+
+ // TODO precalculate this outside this method?
+ TextRulerRulePattern restForL1 = head.subPattern(hPos + h.size(), -1).copy();
+ for (TextRulerRuleItem it : restForL1)
+ ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
+ int l1Pos = restForL1.find(l1);
+ if (l1Pos < 0 || l1Pos != restForL1.size() - l1.size()) {
+ TextRulerToolkit.logIf(logReasons, "REASON 1\n\tl1 \t" + l1 + "\n\trestforl1\t"
+ + restForL1);
+ return constraint3ReturnType.C3_L1CandidateSuffixError;
+ }
+
+ // 2: t must not occur in the subpattern after h and before l1
+ if (l1Pos > 0) {
+ TextRulerRulePattern patternBetweenHandL1 = restForL1.subPattern(0, l1Pos);
+ if (patternBetweenHandL1.size() >= t.size()) {
+ if (patternBetweenHandL1.find(t) >= 0) {
+ TextRulerToolkit.logIf(logReasons, "REASON 2");
+ return constraint3ReturnType.C3_TailCandidateH_L1Error;
+ }
+ }
+ }
+
+ // 2a: add-ons, not specified in the WIEN paper!
+ TextRulerRulePattern lastSlotRightPattern = patternPairs.get(slotNames.length - 1).r;
+ if (t.find(lastSlotRightPattern) == 0) // the right boundary of the last
+ // slot may not be part of the
+ // tail pattern!
+ {
+ TextRulerToolkit.logIf(logReasons, "REASON 3: " + lastSlotRightPattern + "\tTail: " + t);
+ return constraint3ReturnType.C3_TailCandidateRK_PrefixError;
+ }
+
+ int tPos = tail.find(t);
+ if (tPos < 0) {
+ TextRulerToolkit.logIf(logReasons, "REASON 4");
+ return constraint3ReturnType.C3_TailCandidateNotFoundError;
+ } // this is our own constraint definition: if a document does not have
+ // the tail in it,
+ // what should we do then? is this an error or is this okay since the
+ // document may not have any tail after the data?
+
+ // 3: l1 must not precede t in the page's tail:
+ int l1tPos = tail.find(l1);
+ if (l1tPos >= 0) // l1 occurs in the page's tail:
+ {
+ if (l1tPos < tPos) {
+ TextRulerToolkit.logIf(logReasons, "REASON 5");
+ return constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError;
+ }
+ }
+
+ List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc);
+
+ for (TextRulerRulePattern itSep : interTupleSeparators) {
+ // 4: l1 must be a proper suffix of each of the inter-tuple
+ // separators:
+ TextRulerRulePattern itSepCopy = itSep.copy();
+ for (TextRulerRuleItem it : itSepCopy)
+ ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
+ int l1itsPos = itSepCopy.find(l1);
+ if (l1itsPos < 0 || l1itsPos != itSepCopy.size() - l1.size()) {
+ TextRulerToolkit.logIf(logReasons, "REASON 6: \n\tl1\t" + l1 + "\n\titSep\t" + itSep);
+ return constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError;
+ }
+
+ // 5: t must never precede l1 in any inter-tuple separator:
+ int itstPos = itSep.find(t); // searched in the original separator, not in the generalized copy
+ if (itstPos >= 0 && itstPos < l1itsPos) {
+ TextRulerToolkit.logIf(logReasons, "REASON 7");
+ return constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError;
+ }
+
+ }
+ return constraint3ReturnType.C3_SUCCESS;
+ }
+
+ /**
+  * Renders the learned wrapper as script text: the file header, the helper declarations, a rule
+  * that marks the content area after the head pattern, a rule that marks the tail pattern, a
+  * {@code findData} block wrapping the learned rule, and finally the clean-up rules.
+  *
+  * @return the script text, or {@code "<no results yet>"} while no rule has been learned
+  */
+ public String getResultString() {
+   if (theRule == null) {
+     return "<no results yet>";
+   }
+
+   StringBuilder script = new StringBuilder();
+   script.append(getTMFileHeaderString());
+   script.append("DECLARE wien_tail;\n");
+   script.append("DECLARE wien_rulemark;\n");
+   script.append("DECLARE wien_content;\n");
+   script.append("BOOLEAN wien_redo;\n\n");
+   script.append("// tail/head/content area stuff:\n");
+
+   // head rule: its first item must not already lie inside a content area
+   TextRulerRulePattern headRule = hPattern.copy();
+   ((WienRuleItem) headRule.get(0)).addCondition("-PARTOF(wien_content)");
+   script.append(headRule).append(" ALL*?{->MARK(wien_content)};\n");
+
+   // tail rule: its first item has to lie inside the content area
+   TextRulerRulePattern tailRule = tPattern.copy();
+   ((WienRuleItem) tailRule.get(0)).addCondition("PARTOF(wien_content)");
+   script.append(tailRule).append("{->MARK(wien_tail");
+   if (tPattern.size() > 1) {
+     // multi-item tail: pass the item range to MARK
+     script.append(", 1, ").append(tPattern.size());
+   }
+   script.append(")};\n\n");
+
+   script.append("BLOCK(findData) wien_content {\n");
+   script.append("\t// find out if tail is before the next occurence of l1\n");
+   script.append("\t");
+   script.append(theRule.getRuleString());
+   script.append("\n");
+   script.append("\tDocument{->ASSIGN(wien_redo, false)};\n");
+   script.append("\twien_tail{PARTOF(wien_rulemark)->UNMARK(wien_tail), ASSIGN(wien_redo, true)}; // remove tail marks that are no longer relevant for us after the last rule !\n");
+   script.append("\tDocument{IF(wien_redo)->CALL(filename.findData)};\n");
+   script.append("}\n");
+
+   script.append("\n// cleaning up:\n");
+   script.append("wien_tail{->UNMARK(wien_tail)};\n");
+   script.append("wien_rulemark{->UNMARK(wien_rulemark)};\n");
+   script.append("wien_content{->UNMARK(wien_content)};\n");
+   return script.toString();
+ }
+
+ public void setParameters(Map<String, Object> params) { // empty body: the passed params map is ignored
+ }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/wien/Wien.java
------------------------------------------------------------------------------
svn:mime-type = text/plain