You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2013/06/10 10:05:56 UTC
svn commit: r1491365 - in /uima/sandbox/ruta/trunk/ruta-ep-textruler: ./
src/main/java/org/apache/uima/ruta/textruler/learner/kep/
Author: pkluegl
Date: Mon Jun 10 08:05:55 2013
New Revision: 1491365
URL: http://svn.apache.org/r1491365
Log:
UIMA-2860
- initial import of kep project to textruler project
Added:
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPFactory.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPLearner.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPPreferencePage.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRule.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItem.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItemCondition.java
Modified:
uima/sandbox/ruta/trunk/ruta-ep-textruler/plugin.xml
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/plugin.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/plugin.xml?rev=1491365&r1=1491364&r2=1491365&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/plugin.xml (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/plugin.xml Mon Jun 10 08:05:55 2013
@@ -233,6 +233,23 @@ under the License.
</page>
</extension>
<extension
+ point="org.eclipse.ui.preferencePages">
+ <page
+ category="org.apache.uima.ruta.textruler"
+ class="org.apache.uima.ruta.textruler.learner.kep.KEPPreferencePage"
+ id="org.apache.uima.ruta.textruler.kep"
+ name="KEP">
+ </page>
+ </extension>
+ <extension
+ point="org.apache.uima.ruta.textruler.learners">
+ <learner
+ class="org.apache.uima.ruta.textruler.learner.kep.KEPFactory"
+ id="org.apache.uima.ruta.textruler.kep"
+ name="KEP">
+ </learner>
+ </extension>
+ <extension
point="org.eclipse.core.runtime.preferences">
<initializer
class="org.apache.uima.ruta.textruler.preferences.TextRulerPreferenceInitializer">
Added: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPFactory.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPFactory.java?rev=1491365&view=auto
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPFactory.java (added)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPFactory.java Mon Jun 10 08:05:55 2013
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.ruta.textruler.learner.kep;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.ruta.textruler.extension.TextRulerLearner;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerParameter;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerParameter.MLAlgorithmParamType;
+
+/**
+ * Factory for the KEP learner. It is registered as a learner in plugin.xml, creates
+ * {@link KEPLearner} instances and describes the parameters the learner exposes.
+ */
+public class KEPFactory implements TextRulerLearnerFactory {
+
+ public KEPFactory() {
+ }
+
+ /**
+  * Creates a new KEP learner. The {@code additionalFolderPath} argument is not forwarded to
+  * the learner.
+  */
+ public TextRulerLearner createAlgorithm(String inputFolderPath, String additionalFolderPath,
+     String preprocessorTMfile, String tempFolderPath, String[] fullSlotTypeNames,
+     Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
+   TextRulerLearner learner = new KEPLearner(inputFolderPath, preprocessorTMfile, tempFolderPath,
+       fullSlotTypeNames, filterSet, skip, delegate);
+   return learner;
+ }
+
+ /**
+  * Returns the default value for each exposed parameter, keyed by parameter id. The
+  * fillerWindow and maxFillerLength parameters are currently not exposed.
+  */
+ public Map<String, Object> getAlgorithmParameterStandardValues() {
+   Map<String, Object> defaults = new HashMap<String, Object>();
+   defaults.put(KEPLearner.MAX_EXPAND_RULES, KEPLearner.DEFAULT_MAX_EXPAND_RULES);
+   defaults.put(KEPLearner.MAX_INFILLER_RULES, KEPLearner.DEFAULT_MAX_INFILLER_RULES);
+   return defaults;
+ }
+
+ /**
+  * Returns the descriptors of the exposed parameters (both integer-valued). The
+  * fillerWindow and maxFillerLength parameters are currently not exposed.
+  */
+ public TextRulerLearnerParameter[] getAlgorithmParameters() {
+   TextRulerLearnerParameter expandLimit = new TextRulerLearnerParameter(
+       KEPLearner.MAX_EXPAND_RULES, "maxExpandRules", MLAlgorithmParamType.ML_INT_PARAM);
+   TextRulerLearnerParameter infillerLimit = new TextRulerLearnerParameter(
+       KEPLearner.MAX_INFILLER_RULES, "maxInfillerRules", MLAlgorithmParamType.ML_INT_PARAM);
+   return new TextRulerLearnerParameter[] { expandLimit, infillerLimit };
+ }
+
+}
Added: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPLearner.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPLearner.java?rev=1491365&view=auto
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPLearner.java (added)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPLearner.java Mon Jun 10 08:05:55 2013
@@ -0,0 +1,1173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.ruta.textruler.learner.kep;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.ConstraintFactory;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.FSMatchConstraint;
+import org.apache.uima.cas.FSTypeConstraint;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.ruta.engine.RutaEngine;
+import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.ruta.textruler.core.TextRulerExample;
+import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
+import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
+import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
+import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.ruta.textruler.core.TextRulerTarget;
+import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.ruta.textruler.learner.kep.KEPRuleItemCondition.Condition;
+
+public class KEPLearner extends TextRulerBasicLearner {
+
+ // Parameter id for the cap on candidate rules kept per expansion round
+ // (see expandInFillerRules).
+ public static final String MAX_EXPAND_RULES = "maxExpandRules";
+
+ // Parameter id for the cap on infiller rules kept per example
+ // (see makeInFillerRulesForExample).
+ public static final String MAX_INFILLER_RULES = "maxInfillerRules";
+
+ // Parameter id; KEPFactory does not currently expose this parameter.
+ public static final String FILLER_WINDOW = "fillerWindow";
+
+ // Parameter id; KEPFactory does not currently expose this parameter.
+ public static final String MAX_FILLER_LENGTH = "maxFillerLength";
+
+ // Default values reported by KEPFactory for the parameters above.
+ public static final int DEFAULT_MAX_EXPAND_RULES = 50;
+
+ public static final int DEFAULT_MAX_INFILLER_RULES = 10;
+
+ public static final int DEFAULT_FILLER_WINDOW = 5;
+
+ public static final int DEFAULT_MAX_FILLER_LENGTH = 3;
+
+ // NOTE(review): the four parameter fields below are not assigned anywhere in this part of
+ // the file; presumably they are set from the learner parameters elsewhere - confirm.
+ private int fillerWindow;
+
+ private int maxFillerLength;
+
+ private int maxInfillerRules;
+
+ private int maxExpandRules;
+
+ // Learned rules per target name (slot, slot + left boundary, slot + right boundary).
+ private Map<String, List<KEPRule>> ruleLists = new HashMap<String, List<KEPRule>>();
+
+ // Correction (removal) rules per target name, filled by makeRemovalRules.
+ private Map<String, List<KEPRule>> correctionRules = new HashMap<String, List<KEPRule>>();
+
+ // Positive examples already covered by some rule, per target name (see learnRules).
+ private Map<String, List<TextRulerExample>> coveredExamples = new HashMap<String, List<TextRulerExample>>();
+
+ // Enclosing "block" annotation type chosen by getBlocks per target name; may be null.
+ private Map<String, Type> blocks = new HashMap<String, Type>();
+
+ // For slot i: [3*i] = slot name, [3*i+1] = left boundary name, [3*i+2] = right boundary
+ // name. Built in doRun.
+ private String[] slotNamesWithBoundaries;
+
+ // Per target name; initialised to false in initializeMapEntries.
+ private Map<String, Boolean> hasPerfectRules = new HashMap<String, Boolean>();
+
+ /**
+ * Creates the learner; all arguments are passed unchanged to the TextRulerBasicLearner
+ * constructor.
+ */
+ public KEPLearner(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
+ Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
+ super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate);
+ }
+
+ /**
+  * Learns rules for every slot and for its left and right boundary types.
+  *
+  * Steps: build the target names (slot, left boundary, right boundary), add them to the filter
+  * set and initialise the per-target maps; run the learner per target; drop bad rules; create
+  * removal rules for targets without perfect rules and re-select the optimal rule
+  * combination; drop bad rules again and report completion. Returns early, without a final
+  * status update, when an abort is requested while the targets are being learned.
+  */
+ @Override
+ protected void doRun() {
+
+   long startTime = System.nanoTime();
+
+   this.exampleDocuments.clearCurrentExamples();
+   prepareCachedCASesWithBoundaries();
+   // Layout: [3*i] = slot, [3*i+1] = left boundary, [3*i+2] = right boundary.
+   this.slotNamesWithBoundaries = new String[slotNames.length * 3];
+   for (int i = 0; i < this.slotNames.length; i++) {
+     this.slotNamesWithBoundaries[i * 3] = slotNames[i];
+     this.slotNamesWithBoundaries[i * 3 + 1] = slotNames[i]
+         + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION;
+     this.slotNamesWithBoundaries[i * 3 + 2] = slotNames[i]
+         + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION;
+   }
+   for (int i = 0; i < this.slotNamesWithBoundaries.length; i++) {
+     if (!filterSetWithSlotNames.contains(slotNamesWithBoundaries[i])) {
+       this.filterSetWithSlotNames.add(slotNamesWithBoundaries[i]);
+     }
+     initializeMapEntries(this.slotNamesWithBoundaries[i]);
+   }
+   for (int i = 0; i < this.slotNamesWithBoundaries.length; i++) {
+     runForSlot(this.slotNamesWithBoundaries[i]);
+     // The right boundary is the last of a slot's three targets. Use the array layout
+     // instead of a substring test on the name: a slot name that merely contains the
+     // extension text would otherwise be mistaken for a boundary target and index i - 2
+     // could fall outside the array.
+     if (i % 3 == 2) {
+       if (hasPerfectRules.get(slotNamesWithBoundaries[i - 2])) {
+         filterSetWithSlotNames.remove(slotNamesWithBoundaries[i - 2]);
+       }
+       if (hasPerfectRules.get(slotNamesWithBoundaries[i - 1])) {
+         filterSetWithSlotNames.remove(slotNamesWithBoundaries[i - 1]);
+       }
+       if (hasPerfectRules.get(slotNamesWithBoundaries[i])) {
+         filterSetWithSlotNames.remove(slotNamesWithBoundaries[i]);
+       }
+     }
+     if (shouldAbort()) {
+       return;
+     }
+   }
+   removeBadRules();
+   for (int i = 0; i < this.slotNamesWithBoundaries.length; i++) {
+     String targetName = slotNamesWithBoundaries[i];
+     List<KEPRule> list = this.ruleLists.get(targetName);
+     if (!shouldAbort() && list != null && !list.isEmpty()) {
+       this.exampleDocuments.createExamplesForTarget(list.get(0).getTarget());
+       if (!hasPerfectRules.get(targetName)) {
+         makeRemovalRules(list.get(0).getTarget());
+       }
+       // Store the re-selected combination; it used to be assigned to the loop-local
+       // variable only and was therefore lost.
+       this.ruleLists.put(targetName, getOptimalRuleCombination(list));
+     }
+   }
+   removeBadRules();
+
+   long estimatedTime = (System.nanoTime() - startTime) / 1000000000;
+   System.out.println(estimatedTime + " seconds needed to learn all rules");
+   sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
+
+ }
+
+ /**
+  * Learns the rules for one target: creates its examples, determines the enclosing block
+  * type, learns rules and finally stores the optimal rule combination for the target.
+  * The block search and the learning step are each skipped when an abort was requested.
+  *
+  * @param slotName
+  *          the name of a slot
+  */
+ private void runForSlot(String slotName) {
+   sendStatusUpdateToDelegate("Working on " + slotName, TextRulerLearnerState.ML_RUNNING, true);
+
+   TextRulerTarget slotTarget = new TextRulerTarget(slotName,
+       TextRulerTarget.MLTargetType.SINGLE_WHOLE_SLOT, this);
+   this.exampleDocuments.createExamplesForTarget(slotTarget);
+
+   if (!shouldAbort()) {
+     Type blockType = getBlocks();
+     blocks.put(slotName, blockType);
+   }
+   if (!shouldAbort()) {
+     learnRules(slotTarget);
+   }
+
+   List<KEPRule> learned = this.ruleLists.get(slotName);
+   List<KEPRule> optimal = getOptimalRuleCombination(learned);
+   this.ruleLists.put(slotName, optimal);
+
+   sendStatusUpdateToDelegate(slotName + " done", TextRulerLearnerState.ML_RUNNING, true);
+ }
+
+ /**
+  * Searches for a "block" type: an annotation type whose annotations strictly enclose every
+  * positive example of the current target.
+  *
+  * Among the types that enclose all positive examples and occur at most as often as there
+  * are positive examples, the one with the highest ratio of total example length to total
+  * annotation length wins; on equal ratio the type with more annotations wins.
+  *
+  * @return the best block type, or null if no type qualifies
+  */
+ private Type getBlocks() {
+   sendStatusUpdateToDelegate("Searching for Blocks", TextRulerLearnerState.ML_RUNNING, false);
+   // Per type name: enclosed examples, summed annotation length, annotation count.
+   Map<String, List<TextRulerExample>> exampleMap = new HashMap<String, List<TextRulerExample>>();
+   Map<String, Double> lengthMap = new HashMap<String, Double>();
+   Map<String, Integer> countMap = new HashMap<String, Integer>();
+   for (TextRulerExampleDocument exampleDocument : exampleDocuments.getDocuments()) {
+     for (AnnotationFS annotation : exampleDocument.getCAS().getAnnotationIndex()) {
+       String typeName = annotation.getType().getName();
+       int annotationLength = annotation.getEnd() - annotation.getBegin();
+       for (TextRulerExample example : exampleDocument.getPositiveExamples()) {
+         int exampleBegin = example.getAnnotation().getBegin();
+         int exampleEnd = example.getAnnotation().getEnd();
+         boolean encloses = annotation.getBegin() <= exampleBegin
+             && annotation.getEnd() >= exampleEnd;
+         boolean strictlyLonger = annotationLength > exampleEnd - exampleBegin;
+         if (encloses && strictlyLonger && !filterSetWithSlotNames.contains(typeName)) {
+           List<TextRulerExample> enclosed = exampleMap.get(typeName);
+           if (enclosed == null) {
+             enclosed = new ArrayList<TextRulerExample>();
+             exampleMap.put(typeName, enclosed);
+           }
+           if (!enclosed.contains(example)) {
+             enclosed.add(example);
+           }
+         }
+       }
+       // Length and count are tracked for every annotation, including filtered types.
+       Double lengthSoFar = lengthMap.get(typeName);
+       if (lengthSoFar == null) {
+         lengthMap.put(typeName, (double) annotationLength);
+       } else {
+         lengthMap.put(typeName, lengthSoFar + annotationLength);
+       }
+       Integer countSoFar = countMap.get(typeName);
+       if (countSoFar == null) {
+         countMap.put(typeName, 1);
+       } else {
+         countMap.put(typeName, countSoFar + 1);
+       }
+     }
+   }
+
+   List<TextRulerExample> allPositives = exampleDocuments.getAllPositiveExamples();
+   int exampleCount = allPositives.size();
+
+   // Candidates are the types that enclose every positive example.
+   List<Type> candidates = new ArrayList<Type>();
+   for (Map.Entry<String, List<TextRulerExample>> entry : exampleMap.entrySet()) {
+     if (entry.getValue().size() == exampleCount) {
+       candidates.add(exampleDocuments.getDocuments().get(0).getCAS().getTypeSystem().getType(
+           entry.getKey()));
+     }
+   }
+   double exLength = 0;
+   for (TextRulerExample ex : allPositives) {
+     exLength += (double) (ex.getAnnotation().getEnd() - ex.getAnnotation().getBegin());
+   }
+   double bestRatio = 0;
+   Type bestType = null;
+   for (Type type : candidates) {
+     int count = countMap.get(type.getName());
+     if (count > exampleCount) {
+       continue;
+     }
+     double ratio = exLength / lengthMap.get(type.getName());
+     boolean better = ratio > bestRatio;
+     // The tie-break needs an existing best type; without this guard a ratio equal to the
+     // initial bestRatio (0) dereferenced a null bestType.
+     boolean tieWithMoreAnnotations = ratio == bestRatio && bestType != null
+         && count > countMap.get(bestType.getName());
+     if (better || tieWithMoreAnnotations) {
+       bestType = type;
+       bestRatio = ratio;
+     }
+   }
+   sendStatusUpdateToDelegate("Searching for Blocks done", TextRulerLearnerState.ML_RUNNING, true);
+   return bestType;
+ }
+
+ /**
+  * Learns the rules for the given target and stores them in ruleLists.
+  *
+  * For each positive example that is not yet covered, infiller rules are generated; after
+  * every example the covered-example list is refreshed from all rules learned so far. Then
+  * candidate classification rules are added, the best rules are selected, extended by
+  * post-/prefillers, and the optimal combination is stored. Returns without storing a new
+  * list when an abort is requested during the example loop.
+  */
+ private void learnRules(TextRulerTarget target) {
+   String slotTypeName = target.getSingleSlotTypeName();
+   List<KEPRule> rules = this.ruleLists.get(slotTypeName);
+   List<TextRulerExample> covered = this.coveredExamples.get(slotTypeName);
+   List<TextRulerExample> positives = this.exampleDocuments.getAllPositiveExamples();
+
+   for (TextRulerExample positive : positives) {
+     if (!covered.contains(positive)) {
+       rules.addAll(makeInFillerRulesForExample(positive));
+     }
+     // Record every positive example that any rule learned so far covers.
+     for (KEPRule rule : rules) {
+       for (TextRulerExample hit : rule.getCoveringStatistics().getCoveredPositiveExamples()) {
+         if (!covered.contains(hit)) {
+           covered.add(hit);
+         }
+       }
+     }
+     if (shouldAbort()) {
+       return;
+     }
+   }
+
+   rules.addAll(getCandidateClassificationRules(target));
+   List<KEPRule> best = getBestAndOptimalRules(rules);
+   List<KEPRule> withFillers = makePostFillers(best, true);
+   best.addAll(withFillers);
+   List<KEPRule> optimal = getOptimalRuleCombination(best);
+   this.ruleLists.put(slotTypeName, optimal);
+ }
+
+ /**
+  * Builds infiller rules for one example: expands candidate rules over the example, keeps
+  * at most maxInfillerRules of them (ranked by KEPRuleComparator) and tests the survivors
+  * on the example documents. Testing and the final status update are skipped when an abort
+  * was requested.
+  */
+ private List<KEPRule> makeInFillerRulesForExample(TextRulerExample e) {
+   String slotTypeName = e.getTarget().getSingleSlotTypeName();
+   String shortName = slotTypeName.substring(slotTypeName.lastIndexOf(".") + 1);
+   sendStatusUpdateToDelegate("Searching for Infiller Rules for " + shortName,
+       TextRulerLearnerState.ML_RUNNING, false);
+
+   Collection<KEPRule> candidates = expandInFillerRules(e, new HashSet<KEPRule>(), true);
+   if (candidates.size() > maxInfillerRules) {
+     List<KEPRule> ranked = new ArrayList<KEPRule>(candidates);
+     Collections.sort(ranked, new KEPRuleComparator(e.getDocumentCAS()));
+     candidates = new HashSet<KEPRule>(ranked.subList(0, maxInfillerRules));
+   }
+
+   List<KEPRule> result = new ArrayList<KEPRule>(candidates);
+   if (!shouldAbort()) {
+     testRulesOnDocumentSet(result, exampleDocuments);
+     sendStatusUpdateToDelegate("Searching for Infiller Rules for " + shortName + " done",
+         TextRulerLearnerState.ML_RUNNING, true);
+   }
+   return result;
+ }
+
+ /**
+  * Recursively grows infiller rules so that they span the annotation of the given example.
+  *
+  * Each round first trims the incoming rule set to maxExpandRules (ranked by
+  * KEPRuleComparator). With an empty rule set, one single-item rule is seeded per annotation
+  * starting at the example's begin. Otherwise every rule that does not yet reach the
+  * example's end is extended by the annotations that start at the end of its last item: an
+  * annotation of the same type as the last item widens that item in place, any other
+  * annotation yields a copy of the rule with one more item. Recursion stops after a round
+  * in which no rule was extended.
+  *
+  * @param e
+  *          the example to cover
+  * @param rules
+  *          the rules built so far; empty on the initial call
+  * @param expanding
+  *          whether another expansion round should be performed
+  * @return the expanded rules; empty if no annotation can seed a rule
+  */
+ private Collection<KEPRule> expandInFillerRules(TextRulerExample e, Collection<KEPRule> rules,
+     boolean expanding) {
+
+   if (rules.size() > maxExpandRules) {
+     List<KEPRule> list = new ArrayList<KEPRule>(rules);
+     Collections.sort(list, new KEPRuleComparator(e.getDocumentCAS()));
+     rules = new HashSet<KEPRule>(list.subList(0, maxExpandRules));
+   }
+   if (!expanding) {
+     return rules;
+   }
+
+   Collection<KEPRule> expandedRules = new HashSet<KEPRule>();
+   if (rules.isEmpty()) {
+     List<AnnotationFS> seeds = getAnnotationsStartingAt(e.getDocumentCAS(), e.getAnnotation()
+         .getBegin(), e.getAnnotation().getEnd());
+     for (AnnotationFS each : seeds) {
+       KEPRuleItem item = new KEPRuleItem(each);
+       KEPRule rule = new KEPRule(this, e.getTarget());
+       rule.addInFillerItem(item);
+       expandedRules.add(rule);
+     }
+     if (expandedRules.isEmpty()) {
+       // Nothing to seed from: recursing with an empty set and expanding == true would
+       // repeat this branch forever and end in a StackOverflowError.
+       return expandedRules;
+     }
+   } else {
+     expanding = false;
+     for (KEPRule eachRule : rules) {
+       TextRulerRulePattern inFiller = eachRule.getInFiller();
+       KEPRuleItem lastItem = (KEPRuleItem) inFiller.get(inFiller.size() - 1);
+       int end = lastItem.getEnd();
+       if (end >= e.getAnnotation().getEnd()) {
+         // The rule already spans the example; keep it unchanged.
+         if (!expandedRules.contains(eachRule)) {
+           expandedRules.add(eachRule);
+         }
+       } else {
+         List<AnnotationFS> annotationsStartingAt = getAnnotationsStartingAt(e.getDocumentCAS(),
+             end, e.getAnnotation().getEnd());
+         if (annotationsStartingAt.isEmpty() && !expandedRules.contains(eachRule)) {
+           expandedRules.add(eachRule);
+         } else {
+           expanding = true;
+           for (AnnotationFS eachFS : annotationsStartingAt) {
+             if (eachFS.getType().getName().equals(lastItem.getType().getName())) {
+               // Same type as the last item: widen that item in place.
+               lastItem.setAnnotation(eachFS);
+               lastItem.setMax(lastItem.getMax() + 1).setReluctant(true);
+               expandedRules.add(eachRule);
+             } else {
+               KEPRule newRule = new KEPRule(eachRule);
+               KEPRuleItem newItem = new KEPRuleItem(eachFS);
+               newRule.addInFillerItem(newItem);
+               expandedRules.add(newRule);
+             }
+           }
+         }
+       }
+     }
+   }
+   return expandInFillerRules(e, expandedRules, expanding);
+ }
+
+ /**
+  * Creates one single-item candidate rule per type returned by getTokensInNExamples for
+  * half the number of positive examples, tests the candidates on the example documents and
+  * returns the rules that addConditions derives from them.
+  */
+ private List<KEPRule> getCandidateClassificationRules(TextRulerTarget target) {
+   String slotTypeName = target.getSingleSlotTypeName();
+   String shortName = slotTypeName.substring(slotTypeName.lastIndexOf(".") + 1);
+   sendStatusUpdateToDelegate("Searching for Candidate Classification Rules for " + shortName,
+       TextRulerLearnerState.ML_RUNNING, false);
+
+   List<TextRulerExample> positives = exampleDocuments.getAllPositiveExamples();
+   int minExamples = exampleDocuments.getAllPositiveExamples().size() / 2;
+   List<Type> frequentTypes = getTokensInNExamples(positives, minExamples, true);
+
+   List<KEPRule> candidates = new ArrayList<KEPRule>();
+   for (Type frequentType : frequentTypes) {
+     KEPRule candidate = new KEPRule(this, target);
+     candidates.add(candidate.addInFillerItem(new KEPRuleItem(frequentType)));
+   }
+   testRulesOnDocumentSet(candidates, exampleDocuments);
+
+   List<KEPRule> refined = addConditions(candidates, target);
+   sendStatusUpdateToDelegate("Searching for Candidate Classification Rules for " + shortName
+       + " done", TextRulerLearnerState.ML_RUNNING, true);
+   return refined;
+ }
+
+ /**
+ * Refines the given single-item rules by adding CONTAINS conditions to their first infiller
+ * item, recursing on rules that still cover negative examples.
+ *
+ * @param rules
+ * the rules to refine; only their first infiller item is used
+ * @param target
+ * the target the new rules are created for
+ * @return the refined rules; may be empty
+ */
+ private List<KEPRule> addConditions(List<KEPRule> rules, TextRulerTarget target) {
+ List<KEPRule> result = new ArrayList<KEPRule>();
+ List<TextRulerExample> allCoveredExamples = getCoveredExamples(rules);
+
+ // Condition candidates: types obtained from getTokensInNExamples for a third of the
+ // positive examples.
+ List<Type> containedTypes = getTokensInNExamples(exampleDocuments.getAllPositiveExamples(),
+ exampleDocuments.getAllPositiveExamples().size() / 3, false);
+ if (!containedTypes.isEmpty()) {
+ for (KEPRule rule : rules) {
+ KEPRuleItem ruleItem = (KEPRuleItem) rule.getInFiller().get(0);
+ for (Type type : containedTypes) {
+ // Skip the item's own type and conditions the item already has.
+ if (!type.getName().equals(ruleItem.getType().getName())
+ && !ruleItem.containsAndCondition(type)) {
+ result.add(new KEPRule(this, target).addInFillerItem(ruleItem.copy().addAndCondition(
+ new KEPRuleItemCondition(type, Condition.CONTAINS, false))));
+ }
+ }
+ }
+ }
+
+ testRulesOnDocumentSet(result, exampleDocuments);
+ result = getBestAndOptimalRules(result);
+ List<KEPRule> toRefine = new ArrayList<KEPRule>();
+ List<KEPRule> toRemove = new ArrayList<KEPRule>();
+ for (KEPRule rule : result) {
+ // Rules without positive coverage are dropped; rules that still cover negatives are
+ // set aside for another refinement round.
+ // NOTE(review): the size check is on the postfiller, which this method never extends -
+ // confirm that this is the intended recursion bound.
+ if (rule.getCoveringStatistics().getCoveredPositivesCount() == 0)
+ toRemove.add(rule);
+ else if (rule.getCoveringStatistics().getCoveredNegativesCount() > 0
+ && rule.getPostFiller().size() < 5)
+ toRefine.add(rule);
+ }
+ result.removeAll(toRemove);
+ result.removeAll(toRefine);
+ result = getBestAndOptimalRules(result);
+ // Done if the kept rules cover as many examples as the input rules did. This early return
+ // skips the status update at the end of the method.
+ if (getCoveredExamples(result).size() == allCoveredExamples.size()) {
+ return result;
+ }
+ if (toRefine.size() > 0) {
+ result.addAll(addConditions(toRefine, target));
+ }
+ sendStatusUpdateToDelegate("Adding conditions to rules for "
+ + target.getSingleSlotTypeName().substring(
+ target.getSingleSlotTypeName().lastIndexOf(".") + 1) + " done",
+ TextRulerLearnerState.ML_RUNNING, true);
+ return result;
+ }
+
+ /**
+ * Extends rules that still cover negative examples by one postfiller item and then hands
+ * the selected rules to makePreFillers. The two methods call each other until a round adds
+ * no new rule while the previous round did not either.
+ *
+ * @param baseRules
+ * the rules to extend
+ * @param changed
+ * whether the previous round produced rules that were not in its input
+ * @return the selected rules; an empty list if baseRules is empty or an abort was requested
+ */
+ private List<KEPRule> makePostFillers(List<KEPRule> baseRules, boolean changed) {
+ if (!baseRules.isEmpty() && !shouldAbort()) {
+ sendStatusUpdateToDelegate("Adding postfillers to rules for "
+ + baseRules.get(0).getTarget().getSingleSlotTypeName().substring(
+ baseRules.get(0).getTarget().getSingleSlotTypeName().lastIndexOf(".") + 1),
+ TextRulerLearnerState.ML_RUNNING, true);
+ } else {
+ return new ArrayList<KEPRule>();
+ }
+// List<TextRulerExample> allCoveredExamples = getCoveredExamples(baseRules);
+ Set<KEPRule> result = new HashSet<KEPRule>();
+ for (KEPRule rule : baseRules) {
+ for (TextRulerExample e : rule.getCoveringStatistics().getCoveredPositiveExamples()) {
+ if (rule.getCoveringStatistics().getCoveredNegativesCount() > 0) {
+ // Continue after the last postfiller item, or after the example if there is none.
+ KEPRuleItem lastItem = ((KEPRuleItem) rule.getPostFiller().lastItem());
+ int end = e.getAnnotation().getEnd();
+ if (lastItem != null) {
+ end = lastItem.getEnd();
+ }
+ List<AnnotationFS> annotations = getAnnotationsStartingAt(e.getDocumentCAS(), end, e
+ .getDocumentCAS().getDocumentText().length());
+ // Do not extend for this example if one of the following annotations has the block
+ // type stored for the target.
+ boolean blockBoundaryHit = false;
+ for (AnnotationFS annotationFS : annotations) {
+ if (annotationFS.getType().equals(
+ blocks.get(baseRules.get(0).getTarget().getSingleSlotTypeName()))) {
+ blockBoundaryHit = true;
+ break;
+ }
+ }
+ if (blockBoundaryHit) {
+ continue;
+ }
+ for (AnnotationFS annotationFS : annotations) {
+ // Same type as the last postfiller item: widen that item in place (this mutates the
+ // base rule); otherwise add a copy of the rule with one more postfiller item.
+ if (annotationFS.getType().getName().equals(
+ lastItem != null ? lastItem.getType().getName() : null)) {
+ lastItem.setReluctant(true).setMax(lastItem.getMax() + 1).setAnnotation(annotationFS);
+ } else {
+ result.add(rule.copy().addPostFillerItem(new KEPRuleItem(annotationFS)));
+ }
+ // NOTE(review): makePreFillers has no counterpart of this re-add - confirm the
+ // asymmetry is intended.
+ if (rule.getPreFiller().isEmpty())
+ result.add(rule);
+ }
+ } else {
+ // The rule covers no negative examples and is kept as it is.
+ result.add(rule);
+ }
+ }
+ }
+ List<KEPRule> resultList = new ArrayList<KEPRule>(result);
+ testRulesOnDocumentSet(resultList, exampleDocuments);
+ resultList.addAll(baseRules);
+ resultList = getBestAndOptimalRules(resultList);
+ // Stop when this round added nothing new and the previous round did not either;
+ // otherwise continue with the prefillers.
+ if (baseRules.containsAll(resultList)) {
+ if (!changed)
+ return resultList;
+ else
+ changed = false;
+ } else {
+ changed = true;
+ }
+ resultList.addAll(makePreFillers(resultList, changed));
+ sendStatusUpdateToDelegate("Adding postfillers to rules for "
+ + baseRules.get(0).getTarget().getSingleSlotTypeName().substring(
+ baseRules.get(0).getTarget().getSingleSlotTypeName().lastIndexOf(".") + 1)
+ + " done", TextRulerLearnerState.ML_RUNNING, true);
+ return resultList;
+ }
+
+ /**
+ * Extends rules that still cover negative examples by one prefiller item and then hands
+ * the selected rules to makePostFillers. The two methods call each other until a round adds
+ * no new rule while the previous round did not either.
+ *
+ * @param baseRules
+ * the rules to extend
+ * @param changed
+ * whether the previous round produced rules that were not in its input
+ * @return the selected rules; an empty list if baseRules is empty or an abort was requested
+ */
+ private List<KEPRule> makePreFillers(List<KEPRule> baseRules, boolean changed) {
+ if (!baseRules.isEmpty() && !shouldAbort()) {
+ sendStatusUpdateToDelegate("Adding prefillers to rules for "
+ + baseRules.get(0).getTarget().getSingleSlotTypeName().substring(
+ baseRules.get(0).getTarget().getSingleSlotTypeName().lastIndexOf(".") + 1),
+ TextRulerLearnerState.ML_RUNNING, true);
+ } else {
+ return new ArrayList<KEPRule>();
+ }
+// List<TextRulerExample> allCoveredExamples = getCoveredExamples(baseRules);
+ Set<KEPRule> result = new HashSet<KEPRule>();
+ for (KEPRule rule : baseRules) {
+ for (TextRulerExample e : rule.getCoveringStatistics().getCoveredPositiveExamples()) {
+ if (rule.getCoveringStatistics().getCoveredNegativesCount() > 0) {
+ // Continue before the first prefiller item, or before the example if there is none.
+ int begin = e.getAnnotation().getBegin();
+ KEPRuleItem firstItem = (KEPRuleItem) rule.getPreFiller().firstItem();
+ if (firstItem != null) {
+ begin = firstItem.getBegin();
+ }
+ List<AnnotationFS> annotations = getAnnotationsEndingAt(begin, e.getDocumentCAS());
+ // Do not extend for this example if one of the preceding annotations has the block
+ // type stored for the target.
+ boolean blockBoundaryHit = false;
+ for (AnnotationFS annotationFS : annotations) {
+ if (annotationFS.getType().equals(
+ blocks.get(baseRules.get(0).getTarget().getSingleSlotTypeName()))) {
+ blockBoundaryHit = true;
+ break;
+ }
+ }
+ if (blockBoundaryHit) {
+ continue;
+ }
+ for (AnnotationFS annotationFS : annotations) {
+ // Same type as the first prefiller item: widen that item in place (this mutates the
+ // base rule); otherwise add a copy of the rule with one more prefiller item.
+ if (annotationFS.getType().getName().equals(
+ firstItem != null ? firstItem.getType().getName() : null)) {
+ firstItem.setReluctant(true).setMax(firstItem.getMax() + 1).setAnnotation(
+ annotationFS);
+ } else {
+ result.add(rule.copy().addPreFillerItem(new KEPRuleItem(annotationFS)));
+ }
+ }
+ } else {
+ // The rule covers no negative examples and is kept as it is.
+ result.add(rule);
+ }
+ }
+ }
+ List<KEPRule> resultList = new ArrayList<KEPRule>(result);
+ testRulesOnDocumentSet(resultList, exampleDocuments);
+ resultList.addAll(baseRules);
+ resultList = getBestAndOptimalRules(resultList);
+ // Stop when this round added nothing new and the previous round did not either;
+ // otherwise continue with the postfillers.
+ if (baseRules.containsAll(resultList)) {
+ if (!changed)
+ return resultList;
+ else
+ changed = false;
+ } else {
+ changed = true;
+ }
+ resultList.addAll(makePostFillers(resultList, changed));
+
+ sendStatusUpdateToDelegate("Adding prefillers to rules for "
+ + baseRules.get(0).getTarget().getSingleSlotTypeName().substring(
+ baseRules.get(0).getTarget().getSingleSlotTypeName().lastIndexOf(".") + 1)
+ + " done", TextRulerLearnerState.ML_RUNNING, true);
+ return resultList;
+ }
+
+ /**
+  * Drops rules whose covered negative examples are not all corrected.
+  *
+  * For a slot without perfect rules whose two boundary targets both have perfect rules, the
+  * slot's own rule list is filtered. Otherwise the rule list of each boundary target that
+  * has no perfect rules is filtered. The filtered boundary lists used to be computed and
+  * then discarded; they are now stored, as in the slot branch.
+  */
+ private void removeBadRules() {
+   for (int i = 0; i < slotNames.length; i++) {
+     String slot = slotNames[i];
+     String leftBoundary = slotNamesWithBoundaries[3 * i + 1];
+     String rightBoundary = slotNamesWithBoundaries[3 * i + 2];
+     if (!hasPerfectRules.get(slot) && hasPerfectRules.get(leftBoundary)
+         && hasPerfectRules.get(rightBoundary)) {
+       ruleLists.put(slot, rulesWithoutUncorrectedNegatives(slot));
+     } else {
+       if (!hasPerfectRules.get(leftBoundary)) {
+         ruleLists.put(leftBoundary, rulesWithoutUncorrectedNegatives(leftBoundary));
+       }
+       if (!hasPerfectRules.get(rightBoundary)) {
+         ruleLists.put(rightBoundary, rulesWithoutUncorrectedNegatives(rightBoundary));
+       }
+     }
+   }
+ }
+
+ /**
+  * Returns the rules of the given target whose covered negative examples are all among the
+  * corrected examples of that target.
+  */
+ private List<KEPRule> rulesWithoutUncorrectedNegatives(String targetName) {
+   List<KEPRule> kept = new ArrayList<KEPRule>();
+   for (KEPRule kepRule : ruleLists.get(targetName)) {
+     List<TextRulerExample> uncorrected = new ArrayList<TextRulerExample>(kepRule
+         .getCoveringStatistics().getCoveredNegativeExamples());
+     uncorrected.removeAll(getCorrectedExamples(targetName));
+     if (uncorrected.isEmpty()) {
+       kept.add(kepRule);
+     }
+   }
+   return kept;
+ }
+
+ /**
+ * Builds correction rules for the given target from CONTAINS conditions on the target type.
+ * The returned list is the one stored in correctionRules for the target, so every change
+ * made here is visible through that map.
+ *
+ * @param target
+ * the target to create correction rules for
+ * @return the target's correction rule list, or a new empty list if the target has no false
+ * positives
+ */
+ private List<KEPRule> makeRemovalRules(TextRulerTarget target) {
+ sendStatusUpdateToDelegate("Searching for Removal Rules for "
+ + target.getSingleSlotTypeName().substring(
+ target.getSingleSlotTypeName().lastIndexOf(".") + 1),
+ TextRulerLearnerState.ML_RUNNING, false);
+ // This early return skips the status update at the end of the method.
+ if (!hasFalsePositives(target.getSingleSlotTypeName()))
+ return new ArrayList<KEPRule>();
+ List<KEPRule> result = correctionRules.get(target.getSingleSlotTypeName());
+ Type targetType = exampleDocuments.getDocuments().get(0).getCAS().getTypeSystem().getType(
+ target.getSingleSlotTypeName());
+
+ List<Type> containedTypes = getTokensInNExamples(exampleDocuments.getAllPositiveExamples(),
+ exampleDocuments.getAllPositiveExamples().size(), false);
+ List<Type> notContainedTypes = getTokensInNoExample(exampleDocuments.getAllPositiveExamples());
+ // notContainedTypes.retainAll(getTokensInNExamples(getFalsePositives(target), 1, false));
+ // One candidate rule per type; the two groups differ only in the boolean passed to the
+ // condition (true for containedTypes, false for notContainedTypes).
+ if (!containedTypes.isEmpty()) {
+ KEPRuleItem containsRuleItem = new KEPRuleItem(targetType);
+ for (Type type : containedTypes) {
+ result.add(new KEPRule(this, target).addInFillerItem(
+ containsRuleItem.copy().addAndCondition(
+ new KEPRuleItemCondition(type, Condition.CONTAINS, true)))
+ .setCorrectionRule(true));
+ }
+ }
+ if (!notContainedTypes.isEmpty()) {
+ KEPRuleItem notContainsRuleItem = new KEPRuleItem(targetType);
+ for (Type type : notContainedTypes) {
+ result.add(new KEPRule(this, target).addInFillerItem(
+ notContainsRuleItem.copy().addAndCondition(
+ new KEPRuleItemCondition(type, Condition.CONTAINS, false)))
+ .setCorrectionRule(true));
+ }
+ }
+ testCorrectionRules(target);
+ List<KEPRule> toRemove = new ArrayList<KEPRule>();
+ List<KEPRuleItemCondition> toMerge = new ArrayList<KEPRuleItemCondition>();
+ // Every rule in the list is removed. Rules that cover no positive but at least one
+ // negative example first contribute their first condition group to the merged rule.
+ for (KEPRule rule : result) {
+ if (!(rule.getCoveringStatistics().getCoveredPositivesCount() == 0)
+ || !(rule.getCoveringStatistics().getCoveredNegativesCount() > 0)) {
+ toRemove.add(rule);
+ } else {
+ toMerge.addAll(((KEPRuleItem) rule.getInFiller().get(0)).getConditions().get(0));
+ toRemove.add(rule);
+ }
+ }
+ result.removeAll(toRemove);
+ // Replace the removed rules by a single rule that carries all collected conditions.
+ if (!toMerge.isEmpty()) {
+ result.add(new KEPRule(this, target).addInFillerItem(
+ new KEPRuleItem(targetType).addConditions(toMerge)).setCorrectionRule(true));
+ testCorrectionRules(target);
+ }
+ sendStatusUpdateToDelegate("Searching for Removal Rules for "
+ + target.getSingleSlotTypeName().substring(
+ target.getSingleSlotTypeName().lastIndexOf(".") + 1) + " done",
+ TextRulerLearnerState.ML_RUNNING, true);
+ return result;
+ }
+
+ /**
+  * Resets the bookkeeping for one target: empty rule, correction-rule and covered-example
+  * lists, and no perfect rules.
+  */
+ private void initializeMapEntries(String slotName) {
+   List<KEPRule> noRules = new ArrayList<KEPRule>();
+   List<KEPRule> noCorrectionRules = new ArrayList<KEPRule>();
+   List<TextRulerExample> nothingCovered = new ArrayList<TextRulerExample>();
+
+   ruleLists.put(slotName, noRules);
+   correctionRules.put(slotName, noCorrectionRules);
+   coveredExamples.put(slotName, nothingCovered);
+   hasPerfectRules.put(slotName, Boolean.FALSE);
+ }
+
+ /**
+  * Finds the last unfiltered annotation of the RUTA "all" type that begins before {@code end}
+  * and returns every unfiltered annotation that ends at that annotation's end offset.
+  *
+  * @param end
+  *          the offset before which the anchor annotation has to begin
+  * @param cas
+  *          the CAS to search in
+  * @return the annotations ending at the anchor's end offset, or an empty list if no anchor
+  *         annotation exists
+  */
+ private List<AnnotationFS> getAnnotationsEndingAt(int end, CAS cas) {
+   List<AnnotationFS> result = new ArrayList<AnnotationFS>();
+   FSIterator<AnnotationFS> it = cas.getAnnotationIndex(
+           cas.getTypeSystem().getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME)).iterator();
+   // advance to the first annotation that begins at or after 'end'
+   while (it.isValid() && it.get().getBegin() < end) {
+     it.moveToNext();
+   }
+   if (it.isValid()) {
+     it.moveToPrevious();
+   } else {
+     // The scan ran past the last element, i.e. every annotation begins before 'end'. Stepping
+     // back from an invalid iterator does not make it valid again, so jump to the last element
+     // explicitly instead of silently returning an empty result.
+     it.moveToLast();
+   }
+   // walk backwards over filtered annotations until a usable anchor is found
+   while (it.isValid()
+           && (it.get().getBegin() >= end || filterSetWithSlotNames.contains(it.get().getType()
+                   .getName()))) {
+     it.moveToPrevious();
+   }
+   if (!it.isValid()) {
+     return result;
+   }
+   int anchorEnd = it.get().getEnd();
+   FSIterator<AnnotationFS> all = cas.getAnnotationIndex().iterator();
+   while (all.isValid() && all.get().getBegin() <= anchorEnd) {
+     AnnotationFS candidate = all.get();
+     if (candidate.getEnd() == anchorEnd
+             && !filterSetWithSlotNames.contains(candidate.getType().getName())) {
+       result.add(candidate);
+     }
+     all.moveToNext();
+   }
+   return result;
+ }
+
+ /**
+  * Returns the unfiltered annotations that share the first begin offset found at or after
+  * {@code begin} and that end no later than {@code till}. A {@code begin} of 0 is treated as 1.
+  *
+  * @param cas
+  *          the CAS to search in
+  * @param begin
+  *          the offset from which the search starts
+  * @param till
+  *          the maximum end offset of a returned annotation
+  * @return the matching annotations, or an empty list if {@code begin} lies behind the document
+  *         text or nothing matches
+  */
+ private List<AnnotationFS> getAnnotationsStartingAt(CAS cas, int begin, int till) {
+   List<AnnotationFS> result = new ArrayList<AnnotationFS>();
+   if (begin > cas.getDocumentText().length()) {
+     return new ArrayList<AnnotationFS>();
+   }
+   int start = (begin == 0) ? 1 : begin;
+   // a one-character helper annotation right before 'start' positions the iterator
+   AnnotationFS pointer = cas.createAnnotation(cas.getAnnotationType(), start - 1, start);
+   FSIterator<AnnotationFS> unfiltered = cas.getAnnotationIndex().iterator(pointer);
+   FSIterator<AnnotationFS> iterator = cas.createFilteredIterator(unfiltered, getConstraint(cas));
+
+   int firstBegin = -1;
+   for (iterator.moveTo(pointer); iterator.isValid(); iterator.moveToNext()) {
+     AnnotationFS candidate = iterator.get();
+     int candidateBegin = candidate.getBegin();
+     if (firstBegin == -1) {
+       if (candidateBegin < start) {
+         continue;
+       }
+       firstBegin = candidateBegin;
+     }
+     if (candidateBegin > firstBegin) {
+       break;
+     }
+     if (candidateBegin == firstBegin && candidate.getEnd() <= till
+             && !filterSetWithSlotNames.contains(candidate.getType().getName())) {
+       result.add(candidate);
+     }
+   }
+   return result;
+ }
+
+ /**
+  * Selects a combination of the given rules. Rules whose covered negatives are all handled by
+  * correction rules (and that cover at least one positive) are preferred; the remaining best
+  * rules are added until all positive examples are covered or no rules are left. As a side
+  * effect the "has perfect rules" flag of the slot is updated.
+  *
+  * @param rules
+  *          the candidate rules; the list is sorted in place by {@code getBestRules}
+  * @return the selected rules, empty if {@code rules} is empty
+  */
+ private List<KEPRule> getOptimalRuleCombination(List<KEPRule> rules) {
+   if (rules.isEmpty()) {
+     return new ArrayList<KEPRule>();
+   }
+   String slotName = rules.get(0).getTarget().getSingleSlotTypeName();
+   List<TextRulerExample> positiveExamples = exampleDocuments.getAllPositiveExamples();
+   List<TextRulerExample> correctedExamples = getCorrectedExamples(slotName);
+
+   // rules whose false positives are all removed by correction rules
+   List<KEPRule> selection = new ArrayList<KEPRule>();
+   for (KEPRule candidate : rules) {
+     boolean allNegativesCorrected = correctedExamples.containsAll(candidate
+             .getCoveringStatistics().getCoveredNegativeExamples());
+     if (allNegativesCorrected && candidate.getCoveringStatistics().getCoveredPositivesCount() > 0) {
+       selection.add(candidate);
+     }
+   }
+   selection = getBestRules(selection);
+
+   List<TextRulerExample> covered = new ArrayList<TextRulerExample>();
+   for (KEPRule chosen : selection) {
+     covered.addAll(chosen.getCoveringStatistics().getCoveredPositiveExamples());
+   }
+   hasPerfectRules.put(slotName, covered.containsAll(exampleDocuments.getAllPositiveExamples()));
+
+   // fill up with the remaining best rules until everything is covered
+   for (KEPRule candidate : getBestRules(rules)) {
+     if (covered.containsAll(positiveExamples)) {
+       break;
+     }
+     List<TextRulerExample> candidatePositives = candidate.getCoveringStatistics()
+             .getCoveredPositiveExamples();
+     if (!covered.containsAll(candidatePositives)) {
+       covered.removeAll(candidatePositives);
+       covered.addAll(candidatePositives);
+       selection.add(candidate);
+     }
+   }
+   return selection;
+ }
+
+ private List<KEPRule> getBestRules(List<KEPRule> rules) {
+ if (rules.isEmpty())
+ return new ArrayList<KEPRule>();
+ final class AComparator implements Comparator<KEPRule> {
+ public int compare(KEPRule r1, KEPRule r2) {
+ if (r1.getCoveringStatistics().getCoveredPositivesCount() < r2.getCoveringStatistics()
+ .getCoveredPositivesCount())
+ return 1;
+ else if (r1.getCoveringStatistics().getCoveredPositivesCount() > r2.getCoveringStatistics()
+ .getCoveredPositivesCount())
+ return -1;
+ else if (r1.getCoveringStatistics().getCoveredNegativesCount() > r2.getCoveringStatistics()
+ .getCoveredNegativesCount())
+ return 1;
+ else if (r1.getCoveringStatistics().getCoveredNegativesCount() < r2.getCoveringStatistics()
+ .getCoveredNegativesCount())
+ return -1;
+ else if (r1.getPreFiller().size() + r1.getInFiller().size() + r1.getPostFiller().size() < r2
+ .getPreFiller().size()
+ + r2.getInFiller().size() + r2.getPostFiller().size())
+ return -1;
+ return 0;
+ }
+ }
+
+ Collections.sort(rules, new AComparator());
+ List<KEPRule> result = new ArrayList<KEPRule>();
+ List<TextRulerExample> positiveExamples = exampleDocuments.getAllPositiveExamples();
+ List<TextRulerExample> coveredExamples = new ArrayList<TextRulerExample>();
+ for (int i = 0; i < rules.size(); i++) {
+ KEPRule rule = rules.get(i);
+ if ((3 * rule.getCoveringStatistics().getCoveredPositivesCount() >= rule
+ .getCoveringStatistics().getCoveredNegativesCount())
+ && (rule.getCoveringStatistics().getCoveredPositivesCount() >= positiveExamples
+ .size() || !coveredExamples.containsAll(rule.getCoveringStatistics()
+ .getCoveredPositiveExamples()))) {
+ result.add(rule);
+ coveredExamples.addAll(rule.getCoveringStatistics().getCoveredPositiveExamples());
+
+ if (coveredExamples.containsAll(positiveExamples))
+ return result;
+ }
+ }
+ for (int i = 0; i < rules.size(); i++) {
+ KEPRule rule = rules.get(i);
+ if (rule.getCoveringStatistics().getCoveredPositivesCount() >= positiveExamples.size()
+ || !coveredExamples.containsAll(rule.getCoveringStatistics()
+ .getCoveredPositiveExamples())) {
+ result.add(rule);
+ coveredExamples.addAll(rule.getCoveringStatistics().getCoveredPositiveExamples());
+
+ if (coveredExamples.containsAll(positiveExamples))
+ return result;
+ }
+ }
+ return result;
+ }
+
+ /**
+  * Combines the result of {@code getBestRules} with the result of
+  * {@code getOptimalRuleCombination}, skipping rules that are already contained.
+  *
+  * @param rules
+  *          the candidate rules
+  * @return the best rules followed by the additional rules of the optimal combination
+  */
+ private List<KEPRule> getBestAndOptimalRules(List<KEPRule> rules) {
+   List<KEPRule> merged = new ArrayList<KEPRule>(getBestRules(rules));
+   for (KEPRule candidate : getOptimalRuleCombination(rules)) {
+     if (!merged.contains(candidate)) {
+       merged.add(candidate);
+     }
+   }
+   return merged;
+ }
+
+ /**
+  * Determines the annotation types that occur in at least {@code n} of the given examples.
+  *
+  * @param examples
+  *          the examples to inspect
+  * @param n
+  *          the minimum number of distinct examples a type has to occur in
+  * @param countOnlyCoveringTokens
+  *          if {@code true}, only annotations with exactly the example's offsets are counted;
+  *          otherwise every annotation lying within the example's offsets counts
+  * @return the matching types, resolved against the type system of the first example's CAS
+  */
+ private List<Type> getTokensInNExamples(List<TextRulerExample> examples, int n,
+         boolean countOnlyCoveringTokens) {
+   List<Type> result = new ArrayList<Type>();
+   if (examples.isEmpty()) {
+     return result;
+   }
+   // type name -> distinct examples containing an annotation of that type
+   Map<String, List<TextRulerExample>> examplesPerType = new HashMap<String, List<TextRulerExample>>();
+   for (TextRulerExample example : examples) {
+     int exampleBegin = example.getAnnotation().getBegin();
+     int exampleEnd = example.getAnnotation().getEnd();
+     for (AnnotationFS a : TextRulerToolkit.getAnnotationsWithinBounds(example.getDocumentCAS(),
+             exampleBegin, exampleEnd, filterSetWithSlotNames, null)) {
+       String typeName = a.getType().getName();
+       if (filterSetWithSlotNames.contains(typeName)) {
+         continue;
+       }
+       boolean within = a.getBegin() >= exampleBegin && a.getEnd() <= exampleEnd;
+       boolean exact = a.getBegin() == exampleBegin && a.getEnd() == exampleEnd;
+       if (!((!countOnlyCoveringTokens && within) || exact)) {
+         continue;
+       }
+       List<TextRulerExample> containing = examplesPerType.get(typeName);
+       if (containing == null) {
+         containing = new ArrayList<TextRulerExample>();
+         examplesPerType.put(typeName, containing);
+       }
+       if (!containing.contains(example)) {
+         containing.add(example);
+       }
+     }
+   }
+   for (Map.Entry<String, List<TextRulerExample>> entry : examplesPerType.entrySet()) {
+     if (entry.getValue().size() >= n) {
+       result.add(examples.get(0).getDocumentCAS().getTypeSystem().getType(entry.getKey()));
+     }
+   }
+   return result;
+ }
+
+ /**
+  * Determines the unfiltered annotation types that occur somewhere in the example documents
+  * but in none of the given examples.
+  *
+  * @param examples
+  *          the examples whose contained types are excluded
+  * @return the types found in the documents but in no example, resolved against the type system
+  *         of the first example's CAS; empty if {@code examples} is empty
+  */
+ private List<Type> getTokensInNoExample(List<TextRulerExample> examples) {
+   List<Type> result = new ArrayList<Type>();
+   // without any example there is no CAS to resolve the type names against
+   if (examples.isEmpty()) {
+     return result;
+   }
+   List<String> types = new ArrayList<String>();
+   for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
+     for (AnnotationFS a : doc.getCAS().getAnnotationIndex()) {
+       String typeName = a.getType().getName();
+       if (!types.contains(typeName) && !filterSetWithSlotNames.contains(typeName)) {
+         types.add(typeName);
+       }
+     }
+   }
+   List<Type> containedTypes = getTokensInNExamples(examples, 1, false);
+   for (Type type : containedTypes) {
+     types.remove(type.getName());
+   }
+   for (String typeString : types) {
+     result.add(examples.get(0).getDocumentCAS().getTypeSystem().getType(typeString));
+   }
+   return result;
+ }
+
+ public String getResultString() {
+ StringBuffer ruleStrings = new StringBuffer();
+ if (slotNamesWithBoundaries == null || slotNamesWithBoundaries.length == 0)
+ return "No results available yet!";
+
+ for (int i = 0; i < slotNamesWithBoundaries.length; i++) {
+ List<KEPRule> ruleList = this.ruleLists.get(slotNamesWithBoundaries[i]);
+ Type blockType = blocks.get(slotNamesWithBoundaries[i]);
+ if (blockType != null
+ && !(i > 0 && blocks.get(slotNamesWithBoundaries[i - 1]) != null && blocks.get(
+ slotNamesWithBoundaries[i - 1]).getName().equals(blockType.getName()))) {
+ ruleStrings.append("BLOCK(" + blockType.getShortName() + ") " + blockType.getShortName()
+ + "{} { \n");
+ }
+ if (ruleList == null || ruleList.isEmpty()) {
+ if (blockType != null
+ && !(i < slotNamesWithBoundaries.length - 1
+ && blocks.get(slotNamesWithBoundaries[i + 1]) != null && blocks.get(
+ slotNamesWithBoundaries[i + 1]).getName().equals(blockType.getName())))
+ ruleStrings.append("} \n");
+ continue;
+ }
+ ruleStrings.append("// " + slotNamesWithBoundaries[i] + " RULES \n");
+ for (KEPRule rule : new ArrayList<KEPRule>(ruleList)) {
+ ruleStrings.append((blockType != null ? "\t" : "") + rule.getRuleString() + "\t// "
+ + rule.getCoveringStatistics() + "\n");
+ }
+ if (blockType != null
+ && !(i < slotNamesWithBoundaries.length - 1
+ && blocks.get(slotNamesWithBoundaries[i + 1]) != null && blocks.get(
+ slotNamesWithBoundaries[i + 1]).getName().equals(blockType.getName())))
+ ruleStrings.append("}");
+ ruleStrings.append("\n");
+ }
+ StringBuffer boundaryCorrectors = new StringBuffer();
+ StringBuffer wholeSlotCorrectors = new StringBuffer();
+ boundaryCorrectors.append("\n // BOUNDARY CORRECTION RULES: \n");
+ wholeSlotCorrectors.append("\n // CORRECTION RULES: \n");
+ for (int i = 0; i < slotNamesWithBoundaries.length; i++) {
+ List<KEPRule> ruleList = this.correctionRules.get(slotNamesWithBoundaries[i]);
+ if (ruleList == null || ruleList.isEmpty())
+ continue;
+ for (KEPRule rule : ruleList) {
+ if (slotNamesWithBoundaries[i].contains(TextRulerToolkit.LEFT_BOUNDARY_EXTENSION)
+ || slotNamesWithBoundaries[i].contains(TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION)) {
+ boundaryCorrectors.append(rule.getRuleString() + "\t// " + rule.getCoveringStatistics()
+ + "\n");
+ } else {
+ wholeSlotCorrectors.append(rule.getRuleString() + "\t// " + rule.getCoveringStatistics()
+ + "\n");
+ }
+ }
+ }
+ return getFileHeaderString(true) + ruleStrings + boundaryCorrectors + "\n // CONNECTORS: \n"
+ + getConnectorsRuleString() + wholeSlotCorrectors;
+ }
+
+ /**
+  * Builds the script fragment for one slot: package declaration, the slot's rules (inside a
+  * BLOCK if a block type is registered for the slot) and, for non-boundary slots, the fragments
+  * of the left and right boundary slots followed by the two rules that connect the boundaries.
+  *
+  * @param slotName
+  *          the slot type name, optionally carrying a boundary extension
+  * @return the script fragment
+  */
+ private String getAnnotationRulesString(String slotName) {
+   StringBuffer script = new StringBuffer();
+   script.append(getPackageString());
+   script.append("// " + slotName + " RULES \n");
+
+   Type blockType = blocks.get(slotName);
+   boolean inBlock = blockType != null;
+   if (inBlock) {
+     script.append("BLOCK(" + blockType.getShortName() + ") " + blockType.getShortName()
+             + "{} { \n");
+   }
+   List<KEPRule> slotRules = this.ruleLists.get(slotName);
+   if (slotRules != null) {
+     String indent = inBlock ? "\t" : "";
+     for (KEPRule rule : slotRules) {
+       script.append(indent + rule.getRuleString() + "\t// " + rule.getCoveringStatistics()
+               + "\n");
+     }
+   }
+   if (inBlock) {
+     script.append("}");
+   }
+   script.append("\n");
+
+   boolean isBoundarySlot = slotName.contains(TextRulerToolkit.LEFT_BOUNDARY_EXTENSION)
+           || slotName.contains(TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+   if (isBoundarySlot) {
+     return script.toString();
+   }
+
+   // whole slot: add both boundary fragments and connect the boundaries
+   script.append(getAnnotationRulesString(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION));
+   script.append(getAnnotationRulesString(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION));
+   if (inBlock) {
+     script.append("BLOCK(" + blockType.getShortName() + "Correction) "
+             + blockType.getShortName() + "{} { \n");
+   }
+   String shortName = slotName.substring(slotName.lastIndexOf(".") + 1);
+   String leftBoundary = shortName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION;
+   String rightBoundary = shortName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION;
+   script.append(leftBoundary + "{->MARKONCE(" + shortName + ",1,3)} ANY*? " + rightBoundary
+           + ";\n");
+   script.append(leftBoundary + "{IS(" + rightBoundary + ")->MARKONCE(" + shortName + ")} "
+           + ";\n");
+   if (inBlock) {
+     script.append("}");
+   }
+   return script.toString();
+ }
+
+ /**
+  * Builds the connector rules that mark a slot between its left and right boundary
+  * annotations. Rules of slots with a registered block type are grouped into one BLOCK per
+  * block type; the rules of all other slots are appended afterwards without a block.
+  *
+  * @return the connector rule script fragment
+  */
+ private String getConnectorsRuleString() {
+   Map<Type, StringBuffer> rulesPerBlock = new HashMap<Type, StringBuffer>();
+   StringBuffer rulesWithoutBlock = new StringBuffer();
+
+   for (String slotName : this.slotNames) {
+     Type slotBlock = blocks.get(slotName);
+     String indent = slotBlock == null ? "" : "\t";
+     String shortName = slotName.substring(slotName.lastIndexOf(".") + 1);
+     String leftBoundary = shortName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION;
+     String rightBoundary = shortName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION;
+
+     String connectors = indent + leftBoundary + "{->MARKONCE(" + shortName + ",1,3)} ANY*? "
+             + rightBoundary + ";\n" + indent + leftBoundary + "{IS(" + rightBoundary
+             + ")->MARKONCE(" + shortName + ")} " + ";\n";
+
+     if (slotBlock == null) {
+       rulesWithoutBlock.append(connectors);
+       continue;
+     }
+     StringBuffer blockRules = rulesPerBlock.get(slotBlock);
+     if (blockRules == null) {
+       blockRules = new StringBuffer();
+       rulesPerBlock.put(slotBlock, blockRules);
+     }
+     blockRules.append(connectors);
+   }
+
+   StringBuffer script = new StringBuffer();
+   for (Map.Entry<Type, StringBuffer> entry : rulesPerBlock.entrySet()) {
+     String blockName = entry.getKey().getShortName();
+     script.append("BLOCK(" + blockName + "Connectors) " + blockName + "{} { \n"
+             + entry.getValue() + "} \n");
+   }
+   script.append(rulesWithoutBlock);
+   return script.toString();
+ }
+
+ /**
+  * Collects the positive examples covered by at least one of the given rules, without
+  * duplicates and in order of first occurrence.
+  *
+  * @param rules
+  *          the rules whose covered positive examples are collected
+  * @return the distinct covered positive examples
+  */
+ private List<TextRulerExample> getCoveredExamples(List<KEPRule> rules) {
+   List<TextRulerExample> distinct = new ArrayList<TextRulerExample>();
+   for (KEPRule rule : rules) {
+     List<TextRulerExample> positives = rule.getCoveringStatistics()
+             .getCoveredPositiveExamples();
+     for (TextRulerExample example : positives) {
+       if (distinct.contains(example)) {
+         continue;
+       }
+       distinct.add(example);
+     }
+   }
+   return distinct;
+ }
+
+ /**
+  * Collects the negative examples recorded in the statistics of the slot's correction rules,
+  * without duplicates and in order of first occurrence.
+  *
+  * @param slotName
+  *          the slot type name whose correction rules are inspected
+  * @return the distinct examples; empty if no correction rules are registered for the slot
+  */
+ private List<TextRulerExample> getCorrectedExamples(String slotName) {
+   List<TextRulerExample> result = new ArrayList<TextRulerExample>();
+   List<KEPRule> rules = correctionRules.get(slotName);
+   // a slot without registered correction rules simply has no corrected examples
+   if (rules == null) {
+     return result;
+   }
+   for (KEPRule rule : rules) {
+     // correction rules that have not been tested yet carry no statistics
+     if (rule.getCoveringStatistics() == null) {
+       continue;
+     }
+     for (TextRulerExample ex : rule.getCoveringStatistics().getCoveredNegativeExamples()) {
+       if (!result.contains(ex)) {
+         result.add(ex);
+       }
+     }
+   }
+   return result;
+ }
+
+ /**
+  * Tells whether any rule learned for the given slot covers at least one negative example.
+  *
+  * @param singleSlotTypeName
+  *          the slot type name
+  * @return {@code true} if a rule of the slot covers negatives, {@code false} otherwise or if
+  *         the slot has no rules
+  */
+ private boolean hasFalsePositives(String singleSlotTypeName) {
+   List<KEPRule> slotRules = this.ruleLists.get(singleSlotTypeName);
+   if (slotRules != null) {
+     for (KEPRule candidate : slotRules) {
+       if (candidate.getCoveringStatistics().getCoveredNegativesCount() > 0) {
+         return true;
+       }
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Evaluates the correction rules of the target's slot against every example document and
+  * accumulates the outcome in each correction rule's covering statistics.
+  * <p>
+  * For each document the annotation rule script of the slot is applied and compared with the
+  * original document. Then, for every correction rule, the script is applied again, the
+  * correction rule is tested on the resulting CAS, and every example that the plain script
+  * covered but that is missing from the correction rule's test statistics is added to the
+  * rule's statistics (presumably these are the examples the correction rule removed - confirm
+  * against testRuleOnDocument).
+  *
+  * @param target
+  *          the target whose slot type name selects the script and the correction rules
+  */
+ public void testCorrectionRules(TextRulerTarget target) {
+ if (shouldAbort())
+ return;
+ // the script of all annotation rules learned for the slot
+ String rStr = getAnnotationRulesString(target.getSingleSlotTypeName());
+ for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
+ // baseline: what the script alone covers in this document
+ CAS processedCAS = applyScriptOnDocument(rStr, doc, target);
+ TextRulerStatisticsCollector scriptStatistics = new TextRulerStatisticsCollector();
+ compareOriginalDocumentWithTestCAS(doc, processedCAS, target, scriptStatistics,
+ collectNegativeCoveredInstancesWhenTesting());
+ // NOTE(review): no null check here - assumes a correction rule list is registered for the slot
+ for (KEPRule cRule : correctionRules.get(target.getSingleSlotTypeName())) {
+ // NOTE(review): this only leaves the inner loop; the outer loop continues with the next document
+ if (shouldAbort())
+ break;
+ if (cRule.getCoveringStatistics() == null) {
+ cRule.setCoveringStatistics(new TextRulerStatisticsCollector());
+ }
+ // the script is re-applied before each correction rule is tested
+ processedCAS = applyScriptOnDocument(rStr, doc, target);
+ TextRulerStatisticsCollector correctedStats = new TextRulerStatisticsCollector();
+ testRuleOnDocument(cRule, doc, correctedStats, processedCAS);
+ // examples covered by the script but absent from the correction rule's test result
+ for (TextRulerExample ex : scriptStatistics.getCoveredNegativeExamples()) {
+ if (!correctedStats.getCoveredNegativeExamples().contains(ex)) {
+ cRule.getCoveringStatistics().addCoveredNegative(ex);
+ }
+ }
+ for (TextRulerExample ex : scriptStatistics.getCoveredPositiveExamples()) {
+ if (!correctedStats.getCoveredPositiveExamples().contains(ex)) {
+ cRule.getCoveringStatistics().addCoveredPositive(ex);
+ }
+ }
+ // bring the counters in line with the example lists just extended
+ cRule.getCoveringStatistics().reflectCountsFromCoveredExamples();
+ }
+ }
+ }
+
+ /**
+  * Creates the boundary annotations in the given CAS for every slot name that does not itself
+  * carry a left or right boundary extension.
+  *
+  * @param cas
+  *          the CAS to add the boundary annotations to
+  */
+ private void prepareCASWithBoundaries(CAS cas) {
+   for (String slotName : slotNames) {
+     boolean isBoundarySlot = slotName.contains(TextRulerToolkit.LEFT_BOUNDARY_EXTENSION)
+             || slotName.contains(TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+     if (isBoundarySlot) {
+       continue;
+     }
+     TextRulerExampleDocument.createBoundaryAnnotationsForCas(cas, slotName, filterSet);
+   }
+ }
+
+ /**
+  * Loads the CAS via the superclass and adds the boundary annotations to it before handing it
+  * out.
+  */
+ @Override
+ public CAS loadCAS(String fileName, CAS reuseCAS) {
+   CAS loaded = super.loadCAS(fileName, reuseCAS);
+   prepareCASWithBoundaries(loaded);
+   return loaded;
+ }
+
+ /**
+  * Adds the boundary annotations to every CAS currently cached by the example documents.
+  */
+ private void prepareCachedCASesWithBoundaries() {
+   for (CAS cached : exampleDocuments.getCachedCASes()) {
+     prepareCASWithBoundaries(cached);
+   }
+ }
+
+ /**
+  * This learner always asks for the negative covered instances to be collected when rules are
+  * tested.
+  *
+  * @return always {@code true}
+  */
+ @Override
+ public boolean collectNegativeCoveredInstancesWhenTesting() {
+   return true;
+ }
+
+ /**
+  * Takes over the learner parameters present in the given map. Parameters that are not
+  * contained in the map keep their current value. In debug mode the parameters are saved to
+  * the temp folder first.
+  *
+  * @param params
+  *          the parameter values keyed by parameter id; the values are cast to {@link Integer}
+  */
+ public void setParameters(Map<String, Object> params) {
+   if (TextRulerToolkit.DEBUG) {
+     saveParametersToTempFolder(params);
+   }
+
+   // TODO try catch
+   if (params.containsKey(FILLER_WINDOW)) {
+     fillerWindow = (Integer) params.get(FILLER_WINDOW);
+   }
+   if (params.containsKey(MAX_EXPAND_RULES)) {
+     maxExpandRules = (Integer) params.get(MAX_EXPAND_RULES);
+   }
+   if (params.containsKey(MAX_FILLER_LENGTH)) {
+     maxFillerLength = (Integer) params.get(MAX_FILLER_LENGTH);
+   }
+   if (params.containsKey(MAX_INFILLER_RULES)) {
+     maxInfillerRules = (Integer) params.get(MAX_INFILLER_RULES);
+   }
+ }
+
+ /**
+  * Creates a constraint that matches every feature structure whose type is neither part of
+  * the filter set nor the Ruta basic type.
+  *
+  * @param cas
+  *          the CAS providing the constraint factory
+  * @return the negated type constraint
+  */
+ protected FSMatchConstraint getConstraint(CAS cas) {
+   final FSTypeConstraint filteredTypes = cas.getConstraintFactory().createTypeConstraint();
+   for (String typeName : getFilterSet()) {
+     filteredTypes.add(typeName);
+   }
+   filteredTypes.add(RutaEngine.BASIC_TYPE);
+
+   // TODO check if this is a legal alternative to "new NotConstraint(constraint)":
+   return new FSMatchConstraint() {
+     private static final long serialVersionUID = -6744378612440830298L;
+
+     private final FSTypeConstraint c = filteredTypes;
+
+     public boolean match(FeatureStructure fs) {
+       return !c.match(fs);
+     }
+   };
+ }
+
+ /**
+  * Orders rules by an occurrence score: for every item of a rule the number of positive
+  * examples is divided (integer division) by the number of annotations of the item's type in
+  * the CAS, with a minimum of 1 per item; the per-item values are summed up. Rules with a
+  * smaller score are ordered first.
+  */
+ public class KEPRuleComparator implements Comparator<KEPRule> {
+
+   private final CAS cas;
+
+   /**
+    * @param cas
+    *          the CAS whose annotation indexes provide the type frequencies
+    */
+   public KEPRuleComparator(CAS cas) {
+     super();
+     this.cas = cas;
+   }
+
+   public int compare(KEPRule o1, KEPRule o2) {
+     double v1 = occurrenceScore(o1);
+     double v2 = occurrenceScore(o2);
+     if (v1 > v2) {
+       return 1;
+     } else if (v2 > v1) {
+       return -1;
+     } else {
+       return 0;
+     }
+   }
+
+   /** Sums the per-item occurrence ratios of all filler, post-filler and pre-filler items. */
+   private double occurrenceScore(KEPRule rule) {
+     // Work on a copy: getInFiller() hands out the rule's own pattern, so appending the other
+     // patterns to it directly would modify the rule on every comparison.
+     List<TextRulerRuleItem> items = new ArrayList<TextRulerRuleItem>(rule.getInFiller());
+     items.addAll(rule.getPostFiller());
+     items.addAll(rule.getPreFiller());
+     int positives = exampleDocuments.getAllPositiveExamples().size();
+     double score = 0;
+     for (TextRulerRuleItem each : items) {
+       KEPRuleItem eachItem = (KEPRuleItem) each;
+       // a type without any annotation counts like a type occurring once (avoids division by zero)
+       int occurrences = Math.max(1, cas.getAnnotationIndex(eachItem.getType()).size());
+       int ratio = positives / occurrences;
+       score += (ratio < 1) ? 1 : ratio;
+     }
+     return score;
+   }
+ }
+
+}
\ No newline at end of file
Added: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPPreferencePage.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPPreferencePage.java?rev=1491365&view=auto
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPPreferencePage.java (added)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPPreferencePage.java Mon Jun 10 08:05:55 2013
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ruta.textruler.learner.kep;
+
+import java.util.ArrayList;
+import java.util.Map;
+
+import org.apache.uima.ruta.textruler.TextRulerPlugin;
+import org.apache.uima.ruta.textruler.extension.TextRulerController;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerController;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.ruta.textruler.extension.TextRulerLearnerParameter;
+import org.eclipse.jface.preference.BooleanFieldEditor;
+import org.eclipse.jface.preference.FieldEditor;
+import org.eclipse.jface.preference.IPreferenceStore;
+import org.eclipse.jface.preference.PreferencePage;
+import org.eclipse.jface.preference.StringFieldEditor;
+import org.eclipse.swt.SWT;
+import org.eclipse.swt.layout.GridData;
+import org.eclipse.swt.layout.GridLayout;
+import org.eclipse.swt.widgets.Composite;
+import org.eclipse.swt.widgets.Control;
+import org.eclipse.ui.IWorkbench;
+import org.eclipse.ui.IWorkbenchPreferencePage;
+
+/**
+ * Preference page for the KEP learner. It creates one field editor per algorithm parameter
+ * announced by the learner's factory and stores the values in the TextRuler plugin's preference
+ * store under the key {@code <controller id>.<parameter id>}.
+ */
+public class KEPPreferencePage extends PreferencePage implements IWorkbenchPreferencePage {
+
+  public static String ID = "org.apache.uima.ruta.textruler.preference.kep";
+
+  private TextRulerLearnerController algorithmController;
+
+  private IPreferenceStore store;
+
+  private ArrayList<FieldEditor> fields = new ArrayList<FieldEditor>();
+
+  /**
+   * Looks up the controller of the KEP learner and attaches the plugin's preference store.
+   */
+  public KEPPreferencePage() {
+    TextRulerLearnerController ctrl = TextRulerController
+            .getControllerForID("org.apache.uima.ruta.textruler.kep");
+    this.algorithmController = ctrl;
+    store = TextRulerPlugin.getDefault().getPreferenceStore();
+    setPreferenceStore(store);
+  }
+
+  /**
+   * Creates the field editors. Boolean parameters get a check box, float, int and string
+   * parameters a text field; select parameters are not shown. If the controller, its factory or
+   * the parameter list is not available, an empty page is returned instead of failing.
+   */
+  @Override
+  protected Control createContents(Composite parent) {
+    Composite top = new Composite(parent, SWT.LEFT);
+    top.setLayoutData(new GridData(GridData.FILL_HORIZONTAL));
+    top.setLayout(new GridLayout());
+
+    if (algorithmController == null) {
+      // the learner is not registered, so there is nothing to configure
+      return top;
+    }
+    TextRulerLearnerFactory f = algorithmController.getFactory();
+    if (f == null) {
+      return top;
+    }
+    TextRulerLearnerParameter[] params = f.getAlgorithmParameters();
+    Map<String, Object> values = f.getAlgorithmParameterStandardValues();
+    if (params == null) {
+      return top;
+    }
+    for (int i = 0; i < params.length; i++) {
+      TextRulerLearnerParameter p = params[i];
+      String id = algorithmController.getID() + "." + p.id;
+      // a parameter without a usable standard value simply gets no default
+      Object standardValue = values == null ? null : values.get(p.id);
+      FieldEditor l = null;
+      switch (p.type) {
+        case ML_BOOL_PARAM: {
+          l = new BooleanFieldEditor(id, p.name, top);
+          if (standardValue instanceof Boolean) {
+            store.setDefault(id, (Boolean) standardValue);
+          }
+          break;
+        }
+
+        case ML_FLOAT_PARAM:
+        case ML_INT_PARAM:
+        case ML_STRING_PARAM: {
+          l = new StringFieldEditor(id, p.name, top);
+          if (standardValue != null) {
+            store.setDefault(id, standardValue.toString());
+          }
+          break;
+        }
+        case ML_SELECT_PARAM:
+        default:
+          break;
+      }
+      if (l != null) {
+        fields.add(l);
+        l.setPreferenceStore(store);
+        l.load();
+      }
+    }
+    return top;
+  }
+
+  public void init(IWorkbench workbench) {
+  }
+
+  /** Resets every field editor to its default value. */
+  @Override
+  protected void performDefaults() {
+    for (FieldEditor f : fields)
+      f.loadDefault();
+    // super.performDefaults();
+  }
+
+  /** Stores the values of all field editors in the preference store. */
+  @Override
+  public boolean performOk() {
+    for (FieldEditor f : fields)
+      f.store();
+    // return super.performOk();
+    return true;
+  }
+}
Added: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRule.java?rev=1491365&view=auto
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRule.java (added)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRule.java Mon Jun 10 08:05:55 2013
@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.ruta.textruler.learner.kep;
+
+import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.ruta.textruler.core.TextRulerMultiSlotRule;
+import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
+import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
+import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
+import org.apache.uima.ruta.textruler.core.TextRulerTarget;
+
+public class KEPRule extends TextRulerMultiSlotRule {
+
+ private boolean isCorrectionRule = false;
+
+ public KEPRule(KEPRule copyFrom) {
+ super(copyFrom);
+ this.isCorrectionRule = copyFrom.isCorrectionRule;
+ }
+
+ public KEPRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+ super(parentAlgorithm, target);
+ slotPatterns.add(new TextRulerSlotPattern());
+ }
+
+ @Override
+ public KEPRule copy() {
+ return new KEPRule(this);
+ }
+
+ public double getLaplacian() {
+ int e = 0;
+ int n = 0;
+
+ if (coveringStatistics != null) {
+ e = coveringStatistics.getCoveredNegativesCount();
+ n = coveringStatistics.getCoveredNegativesCount()
+ + coveringStatistics.getCoveredPositivesCount();
+ }
+ return ((double) e + 1) / ((double) n + 1);
+ }
+
+ public boolean containsTerm(KEPRuleItem term) {
+ for (TextRulerSlotPattern sp : slotPatterns) {
+ for (TextRulerRuleItem i : sp.preFillerPattern)
+ if (i.equals(term))
+ return true;
+ for (TextRulerRuleItem i : sp.fillerPattern)
+ if (i.equals(term))
+ return true;
+ for (TextRulerRuleItem i : sp.postFillerPattern)
+ if (i.equals(term))
+ return true;
+ }
+ return false;
+ }
+
+ public KEPRuleItem searchNeighborOfItem(KEPRuleItem item, boolean goToLeft) {
+ int slotIndex = -1;
+ int patternIndex = -1;
+ int slotI = 0;
+ for (TextRulerSlotPattern sp : slotPatterns) {
+ for (TextRulerRuleItem it : sp.preFillerPattern) {
+ if (it == item) {
+ slotIndex = slotI;
+ patternIndex = 0; // 0=preFiller
+ break;
+ }
+ }
+ if (slotIndex < 0) {
+ for (TextRulerRuleItem it : sp.fillerPattern) {
+ if (it == item) {
+ slotIndex = slotI;
+ patternIndex = 1; // 1=filler
+ break;
+ }
+ }
+ }
+ if (slotIndex < 0) {
+ for (TextRulerRuleItem it : sp.postFillerPattern) {
+ if (it == item) {
+ slotIndex = slotI;
+ patternIndex = 2; // 2=postFiller
+ break;
+ }
+ }
+ }
+ if (slotIndex >= 0) {
+ break;
+ }
+ }
+ if (slotIndex < 0) // we didn't even find the item in our rule ?! how
+ // can this happen ?
+ return null;
+
+ TextRulerRulePattern currentPattern = getPattern(slotIndex, patternIndex);
+ while (currentPattern != null) {
+ int startIndex = currentPattern.indexOf(item); // this is only >= 0
+ // for the first
+ // pattern...
+ if (!goToLeft) // walk forward...
+ {
+ int startSearchFromIndex = startIndex + 1;
+ if (startSearchFromIndex < currentPattern.size())
+ return (KEPRuleItem) currentPattern.get(startSearchFromIndex);
+ else // skip to next pattern
+ {
+ patternIndex++;
+ if (patternIndex > 2) {
+ patternIndex = 0;
+ slotIndex++;
+ if (slotIndex >= slotPatterns.size())
+ return null; // not found!
+ }
+ currentPattern = getPattern(slotIndex, patternIndex);
+ }
+ } else {
+ int startSearchFromIndex = startIndex >= 0 ? startIndex - 1 : currentPattern.size() - 1;
+ if (startSearchFromIndex >= 0 && currentPattern.size() > 0)
+ return (KEPRuleItem) currentPattern.get(startSearchFromIndex);
+ else // skip to previous pattern
+ {
+ patternIndex--;
+ if (patternIndex < 0) {
+ patternIndex = 2;
+ slotIndex--;
+ if (slotIndex < 0)
+ return null; // not found!
+ }
+ currentPattern = getPattern(slotIndex, patternIndex);
+ }
+ }
+ }
+ return null;
+ }
+
+ private TextRulerRulePattern getPattern(int slotIndex, int patternIndex) {
+ TextRulerSlotPattern sp = slotPatterns.get(slotIndex);
+ if (patternIndex == 0)
+ return sp.preFillerPattern;
+ else if (patternIndex == 1)
+ return sp.fillerPattern;
+ else if (patternIndex == 2)
+ return sp.postFillerPattern;
+ else
+ return null;
+ }
+
+ public KEPRule addPostFillerItem(KEPRuleItem item) {
+ this.getPostFiller().add(item);
+ setNeedsCompile(true);
+ return this;
+ }
+
+ public KEPRule addInFillerItem(KEPRuleItem item) {
+ this.getInFiller().add(item);
+ setNeedsCompile(true);
+ return this;
+ }
+
+ public KEPRule addPreFillerItem(KEPRuleItem item) {
+ this.getPreFiller().add(0, item);
+ setNeedsCompile(true);
+ return this;
+ }
+
+ public TextRulerRulePattern getPreFiller() {
+ return this.slotPatterns.get(0).preFillerPattern;
+ }
+
+ public void setPreFiller(TextRulerRulePattern preFiller) {
+ this.slotPatterns.get(0).preFillerPattern = preFiller;
+ setNeedsCompile(true);
+ }
+
+ public TextRulerRulePattern getInFiller() {
+ return this.slotPatterns.get(0).fillerPattern;
+ }
+
+ public void setInFiller(TextRulerRulePattern inFiller) {
+ this.slotPatterns.get(0).fillerPattern = inFiller;
+ setNeedsCompile(true);
+ }
+
+ public TextRulerRulePattern getPostFiller() {
+ return this.slotPatterns.get(0).postFillerPattern;
+ }
+
+ public void setPostFiller(TextRulerRulePattern postFiller) {
+ this.slotPatterns.get(0).postFillerPattern = postFiller;
+ setNeedsCompile(true);
+ }
+
+ public KEPRule setCorrectionRule(boolean isCorrectionRule) {
+ this.isCorrectionRule = isCorrectionRule;
+ setNeedsCompile(true);
+ return this;
+ }
+
+ public boolean isCorrectionRule() {
+ return isCorrectionRule;
+ }
+
+ public TextRulerTarget getTarget() {
+ return this.target;
+ }
+
+ public void setTarget(TextRulerTarget target) {
+ this.target = target;
+ setNeedsCompile(true);
+ }
+
+ public boolean coversSameExamples(KEPRule otherRule) {
+ if (otherRule.getCoveringStatistics().getCoveredPositivesCount() != this
+ .getCoveringStatistics().getCoveredPositivesCount()
+ || !otherRule.getCoveringStatistics().getCoveredPositiveExamples().containsAll(
+ this.getCoveringStatistics().getCoveredPositiveExamples()))
+ return false;
+ return true;
+ }
+}
\ No newline at end of file
Added: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItem.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItem.java?rev=1491365&view=auto
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItem.java (added)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItem.java Mon Jun 10 08:05:55 2013
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.ruta.textruler.learner.kep;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
+import org.apache.uima.ruta.textruler.core.TextRulerRule;
+import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
+
+/**
+ * A single item of a KEP rule pattern.
+ * <p>
+ * An item is rendered as a type name ({@code ANY} when no type is set), followed by an optional
+ * quantifier ({@code *}, {@code [min,max]}, reluctant {@code ?}) and an optional brace block
+ * holding the item's conditions and, for the first filler item, the mark action.
+ * <p>
+ * Conditions are kept as a list of groups. Groups are rendered comma-separated; a group with
+ * more than one condition is wrapped in {@code OR(...)}.
+ */
+public class KEPRuleItem implements TextRulerRuleItem {
+
+  /** Whether the item is rendered with a trailing {@code *}. */
+  private boolean isStarWildCard = false;
+
+  /** Whether the item is rendered with a trailing {@code ?}. */
+  private boolean isReluctant = false;
+
+  /** Type whose short name is rendered; {@code null} is rendered as {@code ANY}. */
+  private Type type;
+
+  /** Annotation this item was created from; {@code null} unless set by a constructor or setter. */
+  private TextRulerAnnotation annotation;
+
+  /** Lower bound of the {@code [min,max]} quantifier; omitted when min and max are both 1. */
+  private int min = 1;
+
+  /** Upper bound of the {@code [min,max]} quantifier; omitted when min and max are both 1. */
+  private int max = 1;
+
+  /** Condition groups; see the class comment for how they are rendered. */
+  private List<List<KEPRuleItemCondition>> conditions = new ArrayList<List<KEPRuleItemCondition>>();
+
+  /**
+   * Copy constructor. The outer condition list and every inner group are copied into new lists;
+   * the condition objects, the annotation and the type are shared with {@code copyFrom}.
+   *
+   * @param copyFrom
+   *          the item to copy
+   */
+  public KEPRuleItem(KEPRuleItem copyFrom) {
+    super();
+    this.annotation = copyFrom.annotation;
+    this.isStarWildCard = copyFrom.isStarWildCard;
+    this.type = copyFrom.type;
+    this.conditions = new ArrayList<List<KEPRuleItemCondition>>();
+    for (List<KEPRuleItemCondition> group : copyFrom.conditions) {
+      this.conditions.add(new ArrayList<KEPRuleItemCondition>(group));
+    }
+    this.isReluctant = copyFrom.isReluctant;
+    this.min = copyFrom.min;
+    this.max = copyFrom.max;
+  }
+
+  /**
+   * Creates an item for the given type without annotation and without conditions.
+   *
+   * @param type
+   *          the type of the item, may be {@code null}
+   */
+  public KEPRuleItem(Type type) {
+    super();
+    this.type = type;
+  }
+
+  /**
+   * Creates an item from an annotation; the item's type is taken from the annotation.
+   *
+   * @param a
+   *          the annotation backing this item
+   */
+  public KEPRuleItem(TextRulerAnnotation a) {
+    super();
+    this.type = a.getType();
+    this.annotation = a;
+  }
+
+  /**
+   * Creates an item from a CAS annotation by wrapping it in a {@link TextRulerAnnotation}; the
+   * item's type is taken from the wrapper.
+   *
+   * @param afs
+   *          the CAS annotation backing this item
+   */
+  public KEPRuleItem(AnnotationFS afs) {
+    super();
+    this.annotation = new TextRulerAnnotation(afs);
+    this.type = this.annotation.getType();
+  }
+
+  /**
+   * Creates an item for the given type with a single regular-expression condition.
+   *
+   * @param type
+   *          the type of the item, may be {@code null}
+   * @param regExpString
+   *          the string handed to {@link KEPRuleItemCondition#KEPRuleItemCondition(String)}
+   */
+  public KEPRuleItem(Type type, String regExpString) {
+    super();
+    this.type = type;
+    List<KEPRuleItemCondition> group = new ArrayList<KEPRuleItemCondition>();
+    group.add(new KEPRuleItemCondition(regExpString));
+    this.conditions.add(group);
+  }
+
+  /** Creates an item without type, annotation or conditions. */
+  public KEPRuleItem() {
+    this.type = null;
+  }
+
+  /**
+   * @return a copy of this item created with the copy constructor
+   */
+  public KEPRuleItem copy() {
+    return new KEPRuleItem(this);
+  }
+
+  /**
+   * Renders this item as part of a rule string.
+   * <p>
+   * The item is a "marking item" when {@code type} is {@code MLRuleItemType.FILLER} and
+   * {@code numberInPattern} is 0. A marking item additionally emits {@code ->UNMARK(...)} for
+   * correction rules or {@code ->MARKONCE(...)} otherwise; when {@code patternSize} is greater
+   * than 1 the action also carries the 1-based range {@code numberInRule + 1} to
+   * {@code numberInRule + patternSize}.
+   * <p>
+   * Braces are emitted when the item is a marking item or has at least one condition group.
+   *
+   * @param rule
+   *          the owning rule; cast to {@link KEPRule}, only dereferenced for marking items
+   * @param type
+   *          the pattern the item belongs to
+   * @param numberInPattern
+   *          index of the item within its pattern
+   * @param patternSize
+   *          number of items in the pattern
+   * @param numberInRule
+   *          index of the item within the whole rule
+   * @param ruleSize
+   *          not used by this implementation
+   * @param slotIndex
+   *          slot index handed to {@code getMarkName}
+   * @return the rule string fragment for this item
+   */
+  public String getStringForRuleString(TextRulerRule rule, MLRuleItemType type,
+          int numberInPattern, int patternSize, int numberInRule, int ruleSize, int slotIndex) {
+
+    KEPRule kepRule = (KEPRule) rule;
+    boolean isMarkingItem = type == MLRuleItemType.FILLER && numberInPattern == 0;
+    boolean hasBraces = isMarkingItem || !this.conditions.isEmpty();
+
+    StringBuilder result = new StringBuilder();
+    result.append(this.type == null ? "ANY" : this.type.getShortName());
+    if (isStarWildCard) {
+      result.append("*");
+    }
+    if (min != 1 || max != 1) {
+      result.append("[").append(min).append(",").append(max).append("]");
+    }
+    if (isReluctant) {
+      result.append("?");
+    }
+    if (hasBraces) {
+      result.append("{");
+    }
+
+    appendConditions(result);
+
+    if (isMarkingItem) {
+      result.append(kepRule.isCorrectionRule() ? "->UNMARK(" : "->MARKONCE(");
+      result.append(kepRule.getMarkName(slotIndex));
+      if (patternSize > 1) {
+        result.append(", ").append(numberInRule + 1);
+        result.append(", ").append(numberInRule + patternSize);
+      }
+      result.append(")");
+    }
+    if (hasBraces) {
+      result.append("}");
+    }
+    return result.toString();
+  }
+
+  /**
+   * Appends the comma-separated condition groups to {@code out}. Empty groups are skipped so
+   * that they can neither corrupt the separators nor hide later groups.
+   */
+  private void appendConditions(StringBuilder out) {
+    boolean first = true;
+    for (List<KEPRuleItemCondition> group : this.conditions) {
+      if (group.isEmpty()) {
+        continue;
+      }
+      if (!first) {
+        out.append(", ");
+      }
+      first = false;
+      if (group.size() == 1) {
+        out.append(group.get(0));
+      } else {
+        out.append("OR(");
+        for (int i = 0; i < group.size(); i++) {
+          if (i > 0) {
+            out.append(", ");
+          }
+          out.append(group.get(i));
+        }
+        out.append(")");
+      }
+    }
+  }
+
+  /**
+   * Renders the item without rule context, i.e. never as a marking item.
+   */
+  @Override
+  public String toString() {
+    return getStringForRuleString(null, null, 0, 0, 0, 0, 0);
+  }
+
+  public boolean isStarWildCard() {
+    return isStarWildCard;
+  }
+
+  /**
+   * Sets the star-wildcard flag. Enabling it resets min and max to 1 so that no
+   * {@code [min,max]} quantifier is rendered next to the star.
+   *
+   * @return this item
+   */
+  public KEPRuleItem setStarWildCard(boolean isStarWildCard) {
+    this.isStarWildCard = isStarWildCard;
+    if (isStarWildCard) {
+      this.min = 1;
+      this.max = 1;
+    }
+    return this;
+  }
+
+  public Type getType() {
+    return type;
+  }
+
+  public void setType(Type type) {
+    this.type = type;
+  }
+
+  /**
+   * Two items are considered equal when their string representations are equal.
+   *
+   * @param o
+   *          the item to compare with, may be {@code null}
+   * @return {@code true} if {@code o} is not {@code null} and renders to the same string
+   */
+  public boolean equals(TextRulerRuleItem o) {
+    return o != null && o.toString().equals(this.toString());
+  }
+
+  /**
+   * @return the begin offset of the backing annotation
+   * @throws NullPointerException
+   *           if the item has no annotation
+   */
+  public int getBegin() {
+    return this.annotation.getBegin();
+  }
+
+  /**
+   * @return the end offset of the backing annotation
+   * @throws NullPointerException
+   *           if the item has no annotation
+   */
+  public int getEnd() {
+    return this.annotation.getEnd();
+  }
+
+  public int getMin() {
+    return min;
+  }
+
+  /**
+   * Sets the lower quantifier bound, raises max to {@code min} if necessary and clears the
+   * star-wildcard flag.
+   *
+   * @return this item
+   */
+  public KEPRuleItem setMin(int min) {
+    this.min = min;
+    if (min > this.max) {
+      this.max = min;
+    }
+    isStarWildCard = false;
+    return this;
+  }
+
+  public int getMax() {
+    return max;
+  }
+
+  /**
+   * Sets the upper quantifier bound, lowers min to {@code max} if necessary and clears the
+   * star-wildcard flag.
+   *
+   * @return this item
+   */
+  public KEPRuleItem setMax(int max) {
+    this.max = max;
+    if (max < this.min) {
+      this.min = max;
+    }
+    isStarWildCard = false;
+    return this;
+  }
+
+  public boolean isReluctant() {
+    return isReluctant;
+  }
+
+  public KEPRuleItem setReluctant(boolean isReluctant) {
+    this.isReluctant = isReluctant;
+    return this;
+  }
+
+  /**
+   * Adds a new group that contains only the given condition.
+   *
+   * @return this item
+   */
+  public KEPRuleItem addAndCondition(KEPRuleItemCondition condition) {
+    List<KEPRuleItemCondition> group = new ArrayList<KEPRuleItemCondition>();
+    group.add(condition);
+    this.conditions.add(group);
+    return this;
+  }
+
+  /**
+   * @return the internal list of condition groups (not a copy)
+   */
+  public List<List<KEPRuleItemCondition>> getConditions() {
+    return this.conditions;
+  }
+
+  /**
+   * Replaces the backing annotation by a wrapper around {@code afs} and takes over its type.
+   */
+  public void setAnnotation(AnnotationFS afs) {
+    this.annotation = new TextRulerAnnotation(afs);
+    this.type = this.annotation.getType();
+  }
+
+  /**
+   * Replaces the condition groups by the given list (the list is stored, not copied).
+   *
+   * @return this item
+   */
+  public KEPRuleItem setConditions(List<List<KEPRuleItemCondition>> conditions) {
+    this.conditions = conditions;
+    return this;
+  }
+
+  /**
+   * Adds the given list as one new condition group (the list is stored, not copied).
+   *
+   * @return this item
+   */
+  public KEPRuleItem addConditions(List<KEPRuleItemCondition> toMerge) {
+    this.conditions.add(toMerge);
+    return this;
+  }
+
+  /**
+   * Checks whether one of the single-condition groups holds a condition that matches the given
+   * type according to {@link KEPRuleItemCondition#equals(Type)}.
+   *
+   * @param type2
+   *          the type to look for
+   * @return {@code true} if such a condition exists
+   */
+  public boolean containsAndCondition(Type type2) {
+    for (List<KEPRuleItemCondition> group : this.conditions) {
+      if (group.size() == 1 && group.get(0).equals(type2)) {
+        return true;
+      }
+    }
+    return false;
+  }
+}
\ No newline at end of file
Added: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItemCondition.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItemCondition.java?rev=1491365&view=auto
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItemCondition.java (added)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/kep/KEPRuleItemCondition.java Mon Jun 10 08:05:55 2013
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.ruta.textruler.learner.kep;
+
+import org.apache.uima.cas.Type;
+
+public class KEPRuleItemCondition {
+
+ public enum Condition {
+ IS, PARTOF, CONTAINS, REGEXP
+ }
+
+ private Type type;
+
+ private Condition condition;
+
+ private boolean isNot;
+
+ private String regExp;
+
+ public KEPRuleItemCondition(Type type, Condition condition, boolean isNot) {
+ this.type = type;
+ this.condition = condition;
+ this.isNot = isNot;
+ this.regExp = "";
+ }
+
+ public KEPRuleItemCondition(String regExp) {
+ this.regExp = regExp;
+ this.condition = Condition.REGEXP;
+ this.isNot = false;
+ }
+
+ public String toString() {
+ return (isNot ? "-" : "") + this.condition.toString() + "("
+ + (regExp + type == null ? "" : type.getShortName()) + ")";
+ }
+
+ public boolean equals(KEPRuleItemCondition other) {
+ if (this.condition == Condition.REGEXP && other.condition == Condition.REGEXP
+ && this.regExp.equals(other.regExp) && this.isNot == other.isNot)
+ return true;
+ if (this.type.toString().equals(other.type.toString()) && this.isNot == other.isNot
+ && this.condition == other.condition)
+ return true;
+ return false;
+ }
+
+ public boolean equals(Type type) {
+ if (this.type.toString().equals(type.toString()))
+ return true;
+ return false;
+ }
+}