You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2011/08/01 16:21:35 UTC
svn commit: r1152792 [3/10] - in /uima/sandbox/trunk/TextMarker:
org.apache.uima.tm.textruler.lp2/ org.apache.uima.tm.textruler.lp2/META-INF/
org.apache.uima.tm.textruler.lp2/bin/ org.apache.uima.tm.textruler.lp2/src/
org.apache.uima.tm.textruler.lp2/s...
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/Rapier.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/Rapier.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/Rapier.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/Rapier.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,714 @@
+package org.apache.uima.tm.textruler.rapier;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.tm.textruler.core.TextRulerAnnotation;
+import org.apache.uima.tm.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.tm.textruler.core.TextRulerExample;
+import org.apache.uima.tm.textruler.core.TextRulerRule;
+import org.apache.uima.tm.textruler.core.TextRulerRuleItem;
+import org.apache.uima.tm.textruler.core.TextRulerRuleList;
+import org.apache.uima.tm.textruler.core.TextRulerRulePattern;
+import org.apache.uima.tm.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.tm.textruler.core.TextRulerTarget;
+import org.apache.uima.tm.textruler.core.TextRulerToolkit;
+import org.apache.uima.tm.textruler.core.TextRulerWordConstraint;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerDelegate;
+
+
+public class Rapier extends TextRulerBasicLearner {
+
+ public final static String COMPRESSION_FAIL_MAX_COUNT_KEY = "compressionFailMaxCount";
+
+ public final static String RULELIST_SIZE_KEY = "ruleListSize";
+
+ public final static String PAIR_COUNT_KEY = "pairCount";
+
+ public final static String LIM_NO_IMPROVEMENTS_KEY = "limNoImprovements";
+
+ public final static String NOISE_THESHOLD_KEY = "noiseThreshold";
+
+ public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";
+
+ public final static String MIN_COVERED_POSITIVES_KEY = "minCoveredPositives";
+
+ public final static String USE_ALL_GENSETS_AT_SPECIALIZATION_KEY = "useAllGenSetsAtSpecialization";
+
+ public final static int STANDARD_COMPRESSION_FAIL_MAX_COUNT = 3;
+
+ public final static int STANDARD_RULELIST_SIZE = 50;
+
+ public final static int STANDARD_PAIR_COUNT = 4;
+
+ public final static int STANDARD_LIM_NO_IMPROVEMENTS = 3;
+
+ public final static float STANDARD_NOISE_THREHSOLD = 0.9f;
+
+ public final static String STANDARD_POSTAG_ROOTTYPE = "de.uniwue.ml.ML.postag";
+
+ public final static int STANDARD_MIN_COVERED_POSITIVES = 1;
+
+ public final static boolean STANDARD_USE_ALL_GENSETS_AT_SPECIALIZATION = true;
+
+ private int compressionFailMaxCount = STANDARD_COMPRESSION_FAIL_MAX_COUNT;
+
+ private int ruleListSize = STANDARD_RULELIST_SIZE;
+
+ private int pairCount = STANDARD_PAIR_COUNT;
+
+ private int limNoImprovements = STANDARD_LIM_NO_IMPROVEMENTS;
+
+ private float noiseThreshold = STANDARD_NOISE_THREHSOLD;
+
+ private String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;
+
+ private int minCoveredPositives = STANDARD_MIN_COVERED_POSITIVES;
+
+ private boolean useAllGenSetsAtSpecialization = STANDARD_USE_ALL_GENSETS_AT_SPECIALIZATION;
+
+ private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();
+
+ private int initialRuleBaseSize;
+
+ private List<TextRulerExample> examples;
+
+ private TextRulerRuleList slotRules;
+
+ private RapierRulePriorityQueue ruleList;
+
+ private String currentSlotName;
+
+ /**
+  * Creates a RAPIER learner. Every argument is handed unchanged to the
+  * {@link TextRulerBasicLearner} constructor; this class adds no construction logic.
+  */
+ public Rapier(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
+ Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+ super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, delegate);
+ }
+
+  /**
+   * Runs the learner for every configured slot name in turn. For each slot the most specific
+   * rule of every positive example is created first; afterwards {@link #findNewRule()} is asked
+   * for compressing rules until {@code compressionFailMaxCount} rounds have failed to deliver an
+   * acceptable rule. The method returns early, without sending the final "Done" status, as soon
+   * as {@code shouldAbort()} reports true.
+   */
+  @Override
+  protected void doRun() {
+    for (String slotName : slotNames) {
+      // only working for one slot yet !
+      currentSlotName = slotName;
+      cachedTestedRuleStatistics.clear();
+      exampleDocuments.createExamplesForTarget(new TextRulerTarget(currentSlotName, this));
+      examples = exampleDocuments.getAllPositiveExamples();
+
+      if (shouldAbort()) {
+        return;
+      }
+
+      slotRules = new TextRulerRuleList();
+      ruleList = new RapierRulePriorityQueue(ruleListSize);
+
+      TextRulerToolkit.log("--- RAPIER START for Slot " + currentSlotName);
+      sendStatusUpdateToDelegate("Creating initial rule base...",
+              TextRulerLearnerState.ML_INITIALIZING, false);
+
+      fillSlotRulesWithMostSpecificRules();
+      updateCompressionStatusString();
+
+      if (TextRulerToolkit.DEBUG) {
+        slotRules.saveToRulesFile(getIntermediateRulesFileName(), getTMFileHeaderString());
+      }
+
+      int failedRounds = 0;
+      while (failedRounds < compressionFailMaxCount) {
+        TextRulerToolkit.log("***** NEW COMPRESSION ROUND; FailCount = " + failedRounds);
+        if (shouldAbort()) {
+          return;
+        }
+
+        RapierRule candidate = findNewRule();
+        // a candidate is only kept if it covers enough positives, is precise enough and is new
+        boolean accepted = candidate != null
+                && candidate.getCoveringStatistics().getCoveredPositivesCount() >= minCoveredPositives
+                && candidate.noiseValue() >= noiseThreshold
+                && !slotRules.contains(candidate);
+        if (!accepted) {
+          failedRounds++;
+          continue;
+        }
+
+        addRuleAndRemoveEmpiricallySubsumedRules(candidate);
+        if (TextRulerToolkit.DEBUG) {
+          slotRules.saveToRulesFile(getIntermediateRulesFileName(), getTMFileHeaderString());
+        }
+      }
+
+      if (TextRulerToolkit.DEBUG) {
+        slotRules.saveToRulesFile(getIntermediateRulesFileName(), getTMFileHeaderString());
+      }
+    }
+
+    sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
+    cachedTestedRuleStatistics.clear();
+    TextRulerToolkit.log("--- RAPIER END");
+  }
+
+  /**
+   * Reports the current compression progress to the delegate: the number of rules currently in
+   * {@code slotRules} relative to the size of the initial rule base, as a rounded percentage.
+   */
+  private void updateCompressionStatusString() {
+    int currentSize = slotRules.size();
+    double percent = Math.round((currentSize / (double) initialRuleBaseSize) * 100.0);
+    String status = "Compressing... (Rules = " + currentSize + "/" + initialRuleBaseSize + " = "
+            + percent + " % ratio)";
+    sendStatusUpdateToDelegate(status, TextRulerLearnerState.ML_RUNNING, true);
+    // TODO also show round numbers and compression fail count and such
+    // things!
+  }
+
+  /**
+   * Adds a POS-tag constraint to {@code item} if the example's CAS holds a POS-tag annotation
+   * that spans exactly the same text range as {@code tokenAnnotation}.
+   * <p>
+   * Nothing is added when no POS-tag root type name is configured, when that type is not
+   * declared in the CAS type system, or when the first annotation found within the token bounds
+   * does not match the token's begin and end exactly.
+   *
+   * @param item
+   *          the rule item receiving the tag constraint
+   * @param tokenAnnotation
+   *          the token whose POS tag is looked up
+   * @param example
+   *          the example providing the document CAS
+   */
+  private void addAvailablePosTagConstraintToItem(RapierRuleItem item,
+          AnnotationFS tokenAnnotation, TextRulerExample example) {
+    if (posTagRootTypeName == null || posTagRootTypeName.length() == 0) {
+      return;
+    }
+    CAS cas = example.getDocumentCAS();
+    TypeSystem ts = cas.getTypeSystem();
+    Type posTagsRootType = ts.getType(posTagRootTypeName);
+    // The original guarded "ts != null" here, after ts had already been dereferenced; the value
+    // that can really be missing is the type itself (getType yields null for unknown names).
+    if (posTagsRootType == null) {
+      return;
+    }
+    List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
+            tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
+    if (posTagAnnotations.isEmpty()) {
+      return;
+    }
+    // only the first hit is considered, and only if it covers the token exactly
+    AnnotationFS posTag = posTagAnnotations.get(0);
+    if (posTag.getBegin() == tokenAnnotation.getBegin()
+            && posTag.getEnd() == tokenAnnotation.getEnd()) {
+      item.addTagConstraint(posTag.getType().getShortName());
+    }
+  }
+
+  /**
+   * Rebuilds {@code slotRules} with one most specific rule per positive example. Each rule gets
+   * one item per token before, inside and after the slot annotation, and is initialised with
+   * covering statistics that contain exactly its own seed example. The resulting list size is
+   * remembered in {@code initialRuleBaseSize}.
+   */
+  private void fillSlotRulesWithMostSpecificRules() {
+    slotRules.clear();
+    for (TextRulerExample seed : examples) {
+      RapierRule seedRule = new RapierRule(this, seed.getTarget());
+      TextRulerAnnotation slot = seed.getAnnotation();
+      CAS docCas = seed.getDocumentCAS();
+      TypeSystem ts = docCas.getTypeSystem();
+      Type tokensRootType = ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+
+      // first, get all words/tokens:
+      List<AnnotationFS> preTokens = TextRulerToolkit.getAnnotationsBeforePosition(
+              seed.getDocumentCAS(), slot.getBegin(), -1,
+              TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
+      List<AnnotationFS> postTokens = TextRulerToolkit.getAnnotationsAfterPosition(
+              seed.getDocumentCAS(), slot.getEnd(), -1,
+              TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
+      List<AnnotationFS> fillerTokens = TextRulerToolkit.getAnnotationsWithinBounds(
+              seed.getDocumentCAS(), slot.getBegin(), slot.getEnd(),
+              TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
+
+      // the before annotations have to be reversed:
+      for (int idx = preTokens.size() - 1; idx >= 0; idx--) {
+        seedRule.addPreFillerItem(createMostSpecificItem(preTokens.get(idx), seed));
+      }
+      for (AnnotationFS token : fillerTokens) {
+        seedRule.addFillerItem(createMostSpecificItem(token, seed));
+      }
+      for (AnnotationFS token : postTokens) {
+        seedRule.addPostFillerItem(createMostSpecificItem(token, seed));
+      }
+
+      // TextRulerToolkit.log("RULE: "+rule.getRuleString());
+      // testRuleOnTrainingsSet(rule, exampleDocuments.getDocuments());
+
+      // this rule has to at least cover its seed example!!
+      TextRulerStatisticsCollector seedStatistics = new TextRulerStatisticsCollector();
+      seedStatistics.addCoveredPositive(seed);
+      seedRule.setCoveringStatistics(seedStatistics);
+      slotRules.add(seedRule);
+    }
+    initialRuleBaseSize = slotRules.size();
+  }
+
+  /**
+   * Creates the rule item for a single token: a word constraint on the token plus, if available,
+   * the matching POS-tag constraint.
+   */
+  private RapierRuleItem createMostSpecificItem(AnnotationFS token, TextRulerExample seed) {
+    RapierRuleItem item = new RapierRuleItem();
+    item.addWordConstraint(new TextRulerWordConstraint(new TextRulerAnnotation(token, seed
+            .getDocument())));
+    addAvailablePosTagConstraintToItem(item, token, seed);
+    return item;
+  }
+
+  /**
+   * Adds {@code rule} to {@code slotRules} unless it is already contained. Before adding, every
+   * existing rule whose covered positive examples are all covered by {@code rule} as well is
+   * removed, and the compression status is refreshed afterwards.
+   *
+   * @param rule
+   *          the newly learned rule
+   */
+  protected void addRuleAndRemoveEmpiricallySubsumedRules(RapierRule rule) {
+    if (slotRules.contains(rule)) {
+      return;
+    }
+    Set<TextRulerExample> newlyCovered = rule.getCoveringStatistics()
+            .getCoveredPositiveExamples();
+
+    // collect first, remove afterwards: slotRules must not change while it is iterated
+    List<TextRulerRule> subsumed = new ArrayList<TextRulerRule>();
+    for (TextRulerRule existing : slotRules) {
+      Set<TextRulerExample> existingCovered = existing.getCoveringStatistics()
+              .getCoveredPositiveExamples();
+      if (newlyCovered.containsAll(existingCovered)) {
+        subsumed.add(existing);
+      }
+    }
+    for (TextRulerRule obsolete : subsumed) {
+      slotRules.remove(obsolete);
+    }
+
+    slotRules.add(rule);
+    updateCompressionStatusString();
+  }
+
+  /**
+   * Performs one compression round and returns the best rule found.
+   * <ol>
+   * <li>Rule pairs are picked at random from {@code slotRules}, preferring rules that are still
+   * initial (uncompressed), and their filler patterns are generalized.</li>
+   * <li>The generalizations are tested (using the statistics cache) and put into the priority
+   * queue {@code ruleList}.</li>
+   * <li>The queued rules are specialized round by round with pre- and post-filler items until
+   * the best rule produces only valid fillers or the best priority value has not improved for
+   * more than {@code limNoImprovements} rounds.</li>
+   * </ol>
+   *
+   * @return the best rule of the queue, or {@code null} if fewer than two slot rules exist, the
+   *         learner was aborted, or no candidate rule could be created at all
+   */
+  protected RapierRule findNewRule() {
+    Random rand = new Random(System.currentTimeMillis());
+
+    Set<RapierRule> generalizations = new HashSet<RapierRule>();
+    // 0. initialization
+    ruleList.clear();
+
+    if (slotRules.size() <= 1) {
+      return null;
+    }
+
+    List<RapierRule> uncompressedRules = new ArrayList<RapierRule>();
+    for (TextRulerRule r : slotRules) {
+      if (((RapierRule) r).isInitialRule()) {
+        uncompressedRules.add((RapierRule) r);
+      }
+    }
+
+    // 1. get generalizations of the two slot filler patterns:
+
+    // create pairs and prefer still uncompressed rules when choosing
+    // "randomly":
+    int pairsLeft = pairCount;
+    if (uncompressedRules.size() == 1) {
+      RapierRule rule1 = uncompressedRules.get(0);
+      RapierRule rule2 = null;
+      // slotRules holds at least two rules here, so a different partner is eventually drawn
+      while (rule2 == null || rule1 == rule2) {
+        rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
+      }
+      generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
+      if (shouldAbort()) {
+        return null;
+      }
+      pairsLeft--;
+    } else if (uncompressedRules.size() == 2) {
+      RapierRule rule1 = uncompressedRules.get(0);
+      RapierRule rule2 = uncompressedRules.get(1);
+      generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
+      if (shouldAbort()) {
+        return null;
+      }
+      pairsLeft--;
+    } else if (uncompressedRules.size() > 2) {
+      int uPairCount = pairCount;
+      if (uPairCount > uncompressedRules.size()) {
+        uPairCount /= 2;
+      }
+      for (int i = 0; i < uPairCount; i++) {
+        RapierRule rule1 = uncompressedRules.get(rand.nextInt(uncompressedRules.size()));
+        RapierRule rule2 = null;
+        while (rule2 == null || rule1 == rule2) {
+          rule2 = uncompressedRules.get(rand.nextInt(uncompressedRules.size()));
+        }
+        generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
+        // honour abort requests here as well, like the other pairing branches do
+        if (shouldAbort()) {
+          return null;
+        }
+        pairsLeft--;
+      }
+    }
+
+    for (int i = 0; i < pairsLeft; i++) {
+      // TODO optimize !! don't call the machinery with the same rule pair
+      // two times in one session !!!
+      // randomly pick two rules:
+      RapierRule rule1 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
+      RapierRule rule2 = null;
+      while (rule2 == null || rule1 == rule2) {
+        rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
+      }
+      generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
+
+      if (shouldAbort()) {
+        return null;
+      }
+    }
+
+    // if (TextRulerToolkit.DEBUG)
+    // {
+    // TextRulerToolkit.log("Rule Generalizations created: " +
+    // generalizations.size());
+    // for (RapierRule newRule : generalizations)
+    // TextRulerToolkit.log("Rule = "+newRule.getRuleString());
+    // }
+
+    // 2. evaluate and enqueue to priority list:
+    List<RapierRule> testRules = new ArrayList<RapierRule>(generalizations);
+
+    for (RapierRule r : testRules) {
+      r.combineSenselessPatternListItems();
+    }
+
+    testRulesIfNotCached(testRules);
+    if (shouldAbort()) {
+      return null;
+    }
+
+    for (RapierRule newRule : generalizations) {
+      if (TextRulerToolkit.DEBUG) {
+        if (!RapierDebugHelper.debugCheckIfRuleCoversItsSeedRuleCoverings(newRule)) {
+          TextRulerToolkit
+                  .log("------------------------------------------------------------------------------------------");
+          TextRulerToolkit
+                  .log("ERROR, A RULE HAS TO COVER AT LEAST EVERY POSITIVE EXAMPLE OF ITS TWO SEED RULES!!!");
+          TextRulerToolkit.log("\t RULE: " + newRule.getRuleString());
+          TextRulerToolkit.log("\t Parent1: " + newRule.getParent1().getRuleString());
+          TextRulerToolkit.log("\t Parent2: " + newRule.getParent2().getRuleString());
+          TextRulerToolkit.log("--------");
+          TextRulerToolkit.log("+RuleCovering: "
+                  + newRule.getCoveringStatistics().getCoveredPositiveExamples());
+          TextRulerToolkit.log("+P1Covering : "
+                  + newRule.getParent1().getCoveringStatistics().getCoveredPositiveExamples());
+          TextRulerToolkit.log("+P2Covering : "
+                  + newRule.getParent2().getCoveringStatistics().getCoveredPositiveExamples());
+
+        }
+      }
+      ruleList.add(newRule);
+    }
+
+    // 3. specialize pre and post fillers:
+    int n = 0;
+    double bestValue = Double.MAX_VALUE;
+    int noImprovementCounter = 0;
+    while (true) {
+      n++;
+      TextRulerToolkit.log(" --- NEW SPECIALIZATOIN ROUND; n = " + n + " noImprovementCounter = "
+              + noImprovementCounter);
+
+      // specializations are collected separately and added after the iteration, so the queue is
+      // never modified while it is being traversed
+      List<RapierRule> newRuleList = new ArrayList<RapierRule>();
+      for (RapierRule curRule : ruleList) {
+
+        List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePreFiller(curRule, n));
+
+        for (RapierRule r : specTestRules) {
+          r.combineSenselessPatternListItems();
+        }
+
+        testRulesIfNotCached(specTestRules);
+        if (shouldAbort()) {
+          return null;
+        }
+
+        newRuleList.addAll(specTestRules);
+      }
+      ruleList.addAll(newRuleList);
+
+      newRuleList.clear();
+      for (RapierRule curRule : ruleList) {
+
+        List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePostFiller(curRule, n));
+
+        for (RapierRule r : specTestRules) {
+          r.combineSenselessPatternListItems();
+        }
+
+        testRulesIfNotCached(specTestRules);
+        if (shouldAbort()) {
+          return null;
+        }
+
+        newRuleList.addAll(specTestRules);
+      }
+      ruleList.addAll(newRuleList);
+
+      RapierRule bestRule = ruleList.peek();
+      if (bestRule == null) {
+        // No generalization could be created for any pair, so the queue is empty and there is
+        // nothing to specialize. Report "no rule" instead of dereferencing null below.
+        // NOTE(review): assumes RapierRulePriorityQueue.peek() returns null when empty - confirm.
+        return null;
+      }
+
+      if (TextRulerToolkit.DEBUG) {
+        // for (RapierRule r: ruleList)
+        // TextRulerToolkit.log("value="+r.getPriority()+" rule = "+r.getRuleString());
+        TextRulerToolkit.log("------------------------------------");
+        TextRulerToolkit.log("BEST RULE FOR THIS SESSION: " + bestRule.getCoveringStatistics());
+        TextRulerToolkit.log(bestRule.getRuleString());
+        TextRulerToolkit.log("------------------------------------");
+      }
+      if (bestRule.producesOnlyValidFillers()) {
+        break; // todo: horizon effects ??
+      }
+
+      // a smaller priority value counts as an improvement
+      if (bestRule.getPriority() < bestValue) {
+        noImprovementCounter = 0;
+        bestValue = bestRule.getPriority();
+      } else {
+        noImprovementCounter++;
+        if (noImprovementCounter > limNoImprovements) {
+          break;
+        }
+      }
+    }
+
+    return ruleList.peek();
+  }
+
+  /**
+   * Generalizes the filler patterns of two rules and wraps every resulting pattern into a new
+   * rule. Each new rule targets the same target as {@code rule1}, consists only of the
+   * generalized filler items (copied), references copies of both input rules as its parents with
+   * all pre-/post-filler counters set to 0, and is flagged as needing compilation.
+   *
+   * @return one new rule per generalized filler pattern, possibly empty
+   */
+  private List<RapierRule> getFillerGeneralizationsForRulePair(RapierRule rule1, RapierRule rule2) {
+    TextRulerToolkit
+            .log("------------------------------------------------------------------------------------------");
+    TextRulerToolkit.log("getFillerGeneralizationsForRulePair:");
+    TextRulerToolkit.log("Rule1: " + rule1.getRuleString());
+    TextRulerToolkit.log("Rule2: " + rule2.getRuleString());
+
+    List<RapierRule> generalizedRules = new ArrayList<RapierRule>();
+    TextRulerRulePattern filler1 = rule1.getFillerPattern();
+    TextRulerRulePattern filler2 = rule2.getFillerPattern();
+    List<TextRulerRulePattern> fillerGeneralizations = RapierGeneralizationHelper
+            .getGeneralizationsForRuleItemPatterns(filler1, filler2);
+
+    // create rules:
+    for (TextRulerRulePattern generalizedFiller : fillerGeneralizations) {
+      RapierRule generalized = new RapierRule(this, rule1.getTarget());
+      for (TextRulerRuleItem fillerItem : generalizedFiller) {
+        generalized.addFillerItem(fillerItem.copy());
+      }
+
+      generalized.setParent1(rule1.copy());
+      generalized.setParent1PreFiller_n(0);
+      generalized.setParent1PostFiller_n(0);
+
+      generalized.setParent2(rule2.copy());
+      generalized.setParent2PreFiller_n(0);
+      generalized.setParent2PostFiller_n(0);
+
+      generalizedRules.add(generalized);
+      generalized.setNeedsCompile(true);
+      // TextRulerToolkit.log("newRule: "+newRule.getRuleString());
+    }
+    TextRulerToolkit.log(" getGeneralizationsForRulePair result list size = "
+            + generalizedRules.size());
+    return generalizedRules;
+  }
+
+ /**
+  * Creates specializations of {@code curRule} by extending its pre-filler with generalizations
+  * of the pre-filler items of its two parent rules.
+  * <p>
+  * From each parent's pre-filler pattern, the items from index {@code size - n} (or
+  * {@code size - n + 1}) up to index {@code size - nX - 1} are considered, where {@code nX} is
+  * the pre-filler counter stored in {@code curRule} for that parent. If the start index is
+  * negative, no item of that parent is considered. Up to three pairings (n vs. n-1, n-1 vs. n,
+  * n vs. n) are generalized; the last two only if {@code useAllGenSetsAtSpecialization} is set.
+  * Every distinct generalized pattern yields one copy of {@code curRule} with the pattern's
+  * items added via {@code addPreFillerItem} from last to first and both parent pre-filler
+  * counters set to {@code n}.
+  *
+  * @param curRule
+  *          the rule to specialize; its two parents are read via getParent1()/getParent2()
+  * @param n
+  *          the current specialization round
+  * @return the specialized rules, possibly empty
+  */
+ public List<RapierRule> specializePreFiller(RapierRule curRule, int n) {
+ RapierRule baseRule1 = curRule.getParent1();
+ RapierRule baseRule2 = curRule.getParent2();
+ int n1 = curRule.getParent1PreFiller_n();
+ int n2 = curRule.getParent2PreFiller_n();
+ TextRulerRulePattern preFiller1 = baseRule1.getPreFillerPattern();
+ TextRulerRulePattern preFiller2 = baseRule2.getPreFillerPattern();
+ int preFiller1MaxIndex = preFiller1.size() - n1 - 1;
+ int preFiller2MaxIndex = preFiller2.size() - n2 - 1;
+
+ // generate 3 different possible sets for generalizations:
+
+ // 1. n vs. n-1 (n elements of baserule1, n-1 of baserule2)
+ TextRulerRulePattern consideredPreFiller1 = new TextRulerRulePattern();
+ TextRulerRulePattern consideredPreFiller2 = new TextRulerRulePattern();
+ for (int i = preFiller1.size() - n; i >= 0 && i <= preFiller1MaxIndex; i++)
+ consideredPreFiller1.add(preFiller1.get(i));
+ for (int i = preFiller2.size() - n + 1; i >= 0 && i <= preFiller2MaxIndex; i++)
+ consideredPreFiller2.add(preFiller2.get(i));
+ List<TextRulerRulePattern> genList1 = null;
+ if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0)
+ genList1 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
+ consideredPreFiller1, consideredPreFiller2);
+
+ List<TextRulerRulePattern> genList2 = null;
+ List<TextRulerRulePattern> genList3 = null;
+
+ if (useAllGenSetsAtSpecialization) // due to performance reasons the
+ // user can switch this off
+ {
+ // 2. n-1 vs. n (n-1 elements of baserule1, n of baserule2)
+ consideredPreFiller1.clear();
+ consideredPreFiller2.clear();
+ for (int i = preFiller1.size() - n + 1; i >= 0 && i <= preFiller1MaxIndex; i++)
+ consideredPreFiller1.add(preFiller1.get(i));
+ for (int i = preFiller2.size() - n; i >= 0 && i <= preFiller2MaxIndex; i++)
+ consideredPreFiller2.add(preFiller2.get(i));
+
+ if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0)
+ genList2 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
+ consideredPreFiller1, consideredPreFiller2);
+
+ // 3. n vs. n (n elements of baserule1, n of baserule2)
+ consideredPreFiller1.clear();
+ consideredPreFiller2.clear();
+ for (int i = preFiller1.size() - n; i >= 0 && i <= preFiller1MaxIndex; i++)
+ consideredPreFiller1.add(preFiller1.get(i));
+ for (int i = preFiller2.size() - n; i >= 0 && i <= preFiller2MaxIndex; i++)
+ consideredPreFiller2.add(preFiller2.get(i));
+ if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0)
+ genList3 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
+ consideredPreFiller1, consideredPreFiller2);
+ }
+
+ // TODO optimize and don't store all 3 genLists ! but for debugging
+ // purposes we keep them for now !
+ // a set is used so that patterns produced by more than one pairing are kept only once
+ Set<TextRulerRulePattern> genSet = new HashSet<TextRulerRulePattern>();
+ if (genList1 != null)
+ genSet.addAll(genList1);
+ if (genList2 != null)
+ genSet.addAll(genList2);
+ if (genList3 != null)
+ genSet.addAll(genList3);
+
+ List<RapierRule> resultRules = new ArrayList<RapierRule>();
+
+ for (TextRulerRulePattern l : genSet) {
+ RapierRule newRule = curRule.copy();
+ // items are handed over from the last pattern element to the first
+ for (int i = l.size() - 1; i >= 0; i--)
+ newRule.addPreFillerItem(l.get(i));
+ newRule.setParent1PreFiller_n(n);
+ newRule.setParent2PreFiller_n(n);
+ resultRules.add(newRule);
+ }
+ return resultRules;
+ }
+
+ /**
+  * Creates specializations of {@code curRule} by extending its post-filler with generalizations
+  * of the post-filler items of its two parent rules.
+  * <p>
+  * From each parent's post-filler pattern, the items from index {@code nX} (the post-filler
+  * counter stored in {@code curRule} for that parent) up to, but excluding, index {@code n}
+  * (or {@code n - 1}) are considered. Three pairings (n vs. n-1, n-1 vs. n, n vs. n) are
+  * generalized. Every distinct generalized pattern yields one copy of {@code curRule} with the
+  * pattern's items appended via {@code addPostFillerItem} and both parent post-filler counters
+  * set to {@code n}.
+  * <p>
+  * NOTE(review): unlike specializePreFiller, this method always computes all three pairings and
+  * does not consult {@code useAllGenSetsAtSpecialization} - confirm whether that is intended.
+  *
+  * @param curRule
+  *          the rule to specialize; its two parents are read via getParent1()/getParent2()
+  * @param n
+  *          the current specialization round; 0 is only logged as an error, not rejected
+  * @return the specialized rules, possibly empty
+  */
+ // n = 1..maxN
+ public List<RapierRule> specializePostFiller(RapierRule curRule, int n) {
+ if (n == 0) {
+ TextRulerToolkit.log("ERROR ! N SHOULD NOT BE 0!");
+ }
+ RapierRule baseRule1 = curRule.getParent1();
+ RapierRule baseRule2 = curRule.getParent2();
+ int n1 = curRule.getParent1PostFiller_n();
+ int n2 = curRule.getParent2PostFiller_n();
+ TextRulerRulePattern postFiller1 = baseRule1.getPostFillerPattern();
+ TextRulerRulePattern postFiller2 = baseRule2.getPostFillerPattern();
+ int postFiller1MinIndex = n1;
+ int postFiller2MinIndex = n2;
+
+ // generate 3 different possible sets for generalizations:
+
+ // 1. n vs. n-1 (n elements of baserule1, n-1 of baserule2)
+ TextRulerRulePattern consideredPostFiller1 = new TextRulerRulePattern();
+ TextRulerRulePattern consideredPostFiller2 = new TextRulerRulePattern();
+ for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n; i++)
+ consideredPostFiller1.add(postFiller1.get(i));
+ for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n - 1; i++)
+ consideredPostFiller2.add(postFiller2.get(i));
+ List<TextRulerRulePattern> genList1 = null;
+ if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0)
+ genList1 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
+ consideredPostFiller1, consideredPostFiller2);
+
+ // 2. n-1 vs. n (n-1 elements of baserule1, n of baserule2)
+ consideredPostFiller1.clear();
+ consideredPostFiller2.clear();
+ for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n - 1; i++)
+ consideredPostFiller1.add(postFiller1.get(i));
+ for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n; i++)
+ consideredPostFiller2.add(postFiller2.get(i));
+ List<TextRulerRulePattern> genList2 = null;
+ if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0)
+ genList2 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
+ consideredPostFiller1, consideredPostFiller2);
+
+ // 3. n vs. n (n elements of baserule1, n of baserule2)
+ consideredPostFiller1.clear();
+ consideredPostFiller2.clear();
+ for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n; i++)
+ consideredPostFiller1.add(postFiller1.get(i));
+ for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n; i++)
+ consideredPostFiller2.add(postFiller2.get(i));
+ List<TextRulerRulePattern> genList3 = null;
+ if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0)
+ genList3 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
+ consideredPostFiller1, consideredPostFiller2);
+
+ // TODO optimize and don't store all 3 genLists ! but for debugging
+ // purposes we keep them for now !
+ // a set is used so that patterns produced by more than one pairing are kept only once
+ Set<TextRulerRulePattern> genSet = new HashSet<TextRulerRulePattern>();
+ if (genList1 != null)
+ genSet.addAll(genList1);
+ if (genList2 != null)
+ genSet.addAll(genList2);
+ if (genList3 != null)
+ genSet.addAll(genList3);
+
+ List<RapierRule> resultRules = new ArrayList<RapierRule>();
+
+ for (TextRulerRulePattern l : genSet) {
+ RapierRule newRule = curRule.copy();
+ for (TextRulerRuleItem t : l)
+ newRule.addPostFillerItem(t);
+ newRule.setParent1PostFiller_n(n);
+ newRule.setParent2PostFiller_n(n);
+ resultRules.add(newRule);
+ }
+ return resultRules;
+ }
+
+ /**
+  * Always returns {@code false}.
+  * NOTE(review): presumably this tells the base learner not to record covered negative
+  * examples while testing rules - confirm against TextRulerBasicLearner.
+  */
+ @Override
+ public boolean collectNegativeCoveredInstancesWhenTesting() {
+ return false;
+ }
+
+ public String getResultString() {
+ if (slotRules != null)
+ return slotRules.getTMFileString(getTMFileHeaderString(), 1000); // if
+ // a
+ // rule
+ // is
+ // >100
+ // characters,
+ // it
+ // gets
+ // replaced
+ // by
+ // a
+ // placeholder
+ else
+ return "No results available yet!";
+ }
+
+  /**
+   * Applies the learner settings contained in {@code params}. Keys that are absent leave the
+   * corresponding setting untouched.
+   * <p>
+   * Numeric settings accept any {@link Number} (for example a {@code Double} for the noise
+   * threshold or a {@code Long} for a count) and are converted with {@code intValue()} /
+   * {@code floatValue()}; previously only the exact boxed type was accepted.
+   *
+   * @param params
+   *          settings keyed by the {@code *_KEY} constants of this class
+   * @throws ClassCastException
+   *           if a value has an incompatible type
+   * @throws NullPointerException
+   *           if a numeric or boolean key is present but mapped to {@code null}
+   */
+  public void setParameters(Map<String, Object> params) {
+    if (TextRulerToolkit.DEBUG) {
+      saveParametersToTempFolder(params);
+    }
+
+    // TODO add try catch
+    if (params.containsKey(COMPRESSION_FAIL_MAX_COUNT_KEY)) {
+      compressionFailMaxCount = ((Number) params.get(COMPRESSION_FAIL_MAX_COUNT_KEY)).intValue();
+    }
+
+    if (params.containsKey(RULELIST_SIZE_KEY)) {
+      ruleListSize = ((Number) params.get(RULELIST_SIZE_KEY)).intValue();
+    }
+
+    if (params.containsKey(PAIR_COUNT_KEY)) {
+      pairCount = ((Number) params.get(PAIR_COUNT_KEY)).intValue();
+    }
+
+    if (params.containsKey(LIM_NO_IMPROVEMENTS_KEY)) {
+      limNoImprovements = ((Number) params.get(LIM_NO_IMPROVEMENTS_KEY)).intValue();
+    }
+
+    if (params.containsKey(NOISE_THESHOLD_KEY)) {
+      noiseThreshold = ((Number) params.get(NOISE_THESHOLD_KEY)).floatValue();
+    }
+
+    if (params.containsKey(POSTAG_ROOTTYPE_KEY)) {
+      posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY);
+    }
+
+    if (params.containsKey(MIN_COVERED_POSITIVES_KEY)) {
+      minCoveredPositives = ((Number) params.get(MIN_COVERED_POSITIVES_KEY)).intValue();
+    }
+
+    if (params.containsKey(USE_ALL_GENSETS_AT_SPECIALIZATION_KEY)) {
+      useAllGenSetsAtSpecialization = (Boolean) params.get(USE_ALL_GENSETS_AT_SPECIALIZATION_KEY);
+    }
+  }
+
+  // TODO share this between algorithms (e.g. LP2 and RAPIER ?) and make a
+  // maximum size of the cache, etc. like CasCache?
+  /**
+   * Makes sure every rule in {@code rules} carries covering statistics. Rules whose rule string
+   * is already in the cache receive a copy of the cached statistics; all others are tested on
+   * the example documents and their statistics are cached afterwards. Before caching, arbitrary
+   * entries are evicted while cache size plus new entries exceeds 10000. If the learner is
+   * aborted during testing, nothing is cached.
+   *
+   * @param rules
+   *          the rules that need covering statistics
+   */
+  protected void testRulesIfNotCached(List<RapierRule> rules) {
+    List<TextRulerRule> uncached = new ArrayList<TextRulerRule>();
+
+    for (RapierRule candidate : rules) {
+      String ruleString = candidate.getRuleString();
+      if (!cachedTestedRuleStatistics.containsKey(ruleString)) {
+        uncached.add(candidate);
+        continue;
+      }
+      candidate.setCoveringStatistics(cachedTestedRuleStatistics.get(ruleString).copy());
+      TextRulerToolkit.log("CACHE HIT; size=" + cachedTestedRuleStatistics.size());
+    }
+
+    if (uncached.isEmpty()) {
+      return;
+    }
+
+    testRulesOnDocumentSet(uncached, exampleDocuments);
+    if (shouldAbort()) {
+      return;
+    }
+
+    // TODO is this worth it? keep an eye on memory usage!
+    while (cachedTestedRuleStatistics.size() + uncached.size() > 10000) {
+      Iterator<String> keys = cachedTestedRuleStatistics.keySet().iterator();
+      if (!keys.hasNext()) {
+        break;
+      }
+      // drop whichever key the map happens to yield first
+      cachedTestedRuleStatistics.remove(keys.next());
+    }
+
+    for (TextRulerRule tested : uncached) {
+      cachedTestedRuleStatistics.put(tested.getRuleString(), tested.getCoveringStatistics()
+              .copy());
+    }
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/Rapier.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/Rapier.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierDebugHelper.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierDebugHelper.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierDebugHelper.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierDebugHelper.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,19 @@
+package org.apache.uima.tm.textruler.rapier;
+
+import java.util.Set;
+
+import org.apache.uima.tm.textruler.core.TextRulerExample;
+
+/**
+ * Debugging aid for the RAPIER learner.
+ */
+public class RapierDebugHelper {
+
+  /**
+   * Checks that a generalized rule covers at least every positive example that its two parent
+   * rules cover.
+   *
+   * @param rule
+   *          the rule to check; both parents and all covering statistics must be set
+   * @return {@code true} if the rule's covered positives contain those of both parents
+   */
+  public static boolean debugCheckIfRuleCoversItsSeedRuleCoverings(RapierRule rule) {
+    Set<TextRulerExample> coveredByParent1 = rule.getParent1().getCoveringStatistics()
+            .getCoveredPositiveExamples();
+    Set<TextRulerExample> coveredByParent2 = rule.getParent2().getCoveringStatistics()
+            .getCoveredPositiveExamples();
+    Set<TextRulerExample> coveredByRule = rule.getCoveringStatistics()
+            .getCoveredPositiveExamples();
+
+    if (!coveredByRule.containsAll(coveredByParent1)) {
+      return false;
+    }
+    return coveredByRule.containsAll(coveredByParent2);
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierDebugHelper.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierDebugHelper.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierFactory.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierFactory.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierFactory.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierFactory.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,57 @@
+package org.apache.uima.tm.textruler.rapier;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.tm.textruler.extension.TextRulerLearner;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerParameter;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerParameter.MLAlgorithmParamType;
+
+// Factory that creates Rapier learners and describes their configurable parameters.
+public class RapierFactory implements TextRulerLearnerFactory {
+
+  // Creates a new Rapier learner. Note that additionalFolderPath is not passed on to it.
+  public TextRulerLearner createAlgorithm(String inputFolderPath, String additionalFolderPath,
+      String prePropTMFile, String tempFolderPath, String[] fullSlotTypeNames,
+      Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+    TextRulerLearner learner = new Rapier(inputFolderPath, prePropTMFile, tempFolderPath,
+        fullSlotTypeNames, filterSet, delegate);
+    return learner;
+  }
+
+  // Describes the eight Rapier parameters: key, display name and value type of each.
+  public TextRulerLearnerParameter[] getAlgorithmParameters() {
+    return new TextRulerLearnerParameter[] {
+        new TextRulerLearnerParameter(Rapier.COMPRESSION_FAIL_MAX_COUNT_KEY,
+            "Maximum Compression Fail Count", MLAlgorithmParamType.ML_INT_PARAM),
+        new TextRulerLearnerParameter(Rapier.RULELIST_SIZE_KEY, "Internal Rules List Size",
+            MLAlgorithmParamType.ML_INT_PARAM),
+        new TextRulerLearnerParameter(Rapier.PAIR_COUNT_KEY, "Rule Pairs for Generalizing",
+            MLAlgorithmParamType.ML_INT_PARAM),
+        new TextRulerLearnerParameter(Rapier.LIM_NO_IMPROVEMENTS_KEY,
+            "Maximum 'No improvement' Count", MLAlgorithmParamType.ML_INT_PARAM),
+        new TextRulerLearnerParameter(Rapier.NOISE_THESHOLD_KEY, "Maximum Noise Threshold",
+            MLAlgorithmParamType.ML_FLOAT_PARAM),
+        new TextRulerLearnerParameter(Rapier.MIN_COVERED_POSITIVES_KEY,
+            "Minimum Covered Positives Per Rule", MLAlgorithmParamType.ML_INT_PARAM),
+        new TextRulerLearnerParameter(Rapier.POSTAG_ROOTTYPE_KEY, "PosTag Root Type",
+            MLAlgorithmParamType.ML_STRING_PARAM),
+        new TextRulerLearnerParameter(Rapier.USE_ALL_GENSETS_AT_SPECIALIZATION_KEY,
+            "Use All 3 GenSets at Specialization", MLAlgorithmParamType.ML_BOOL_PARAM)
+    };
+  }
+
+  // Maps every parameter key to the matching Rapier.STANDARD_* value.
+  public Map<String, Object> getAlgorithmParameterStandardValues() {
+    Map<String, Object> defaults = new HashMap<String, Object>();
+    defaults.put(Rapier.COMPRESSION_FAIL_MAX_COUNT_KEY, Rapier.STANDARD_COMPRESSION_FAIL_MAX_COUNT);
+    defaults.put(Rapier.RULELIST_SIZE_KEY, Rapier.STANDARD_RULELIST_SIZE);
+    defaults.put(Rapier.PAIR_COUNT_KEY, Rapier.STANDARD_PAIR_COUNT);
+    defaults.put(Rapier.LIM_NO_IMPROVEMENTS_KEY, Rapier.STANDARD_LIM_NO_IMPROVEMENTS);
+    defaults.put(Rapier.NOISE_THESHOLD_KEY, Rapier.STANDARD_NOISE_THREHSOLD);
+    defaults.put(Rapier.POSTAG_ROOTTYPE_KEY, Rapier.STANDARD_POSTAG_ROOTTYPE);
+    defaults.put(Rapier.MIN_COVERED_POSITIVES_KEY, Rapier.STANDARD_MIN_COVERED_POSITIVES);
+    defaults.put(Rapier.USE_ALL_GENSETS_AT_SPECIALIZATION_KEY,
+        Rapier.STANDARD_USE_ALL_GENSETS_AT_SPECIALIZATION);
+    return defaults;
+  }
+}
\ No newline at end of file
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierFactory.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierGeneralizationHelper.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierGeneralizationHelper.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierGeneralizationHelper.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierGeneralizationHelper.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,673 @@
+package org.apache.uima.tm.textruler.rapier;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.uima.tm.textruler.core.TextRulerRuleItem;
+import org.apache.uima.tm.textruler.core.TextRulerRulePattern;
+import org.apache.uima.tm.textruler.core.TextRulerToolkit;
+import org.apache.uima.tm.textruler.core.TextRulerWordConstraint;
+
+public class RapierGeneralizationHelper {
+
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // --- ITEM(s) GENERALIZATION
+ // -------------------------------------------------------------------------------------------------------------
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // Generalizes a single pair of rule items: each non-null item is wrapped in a one-element
+ // list (a null item becomes an empty list) and the list-based generalization is used.
+ private static ArrayList<TextRulerRuleItem> getGeneralizationsForRuleItems(
+     TextRulerRuleItem item1, TextRulerRuleItem item2) {
+   ArrayList<TextRulerRuleItem> first = new ArrayList<TextRulerRuleItem>();
+   ArrayList<TextRulerRuleItem> second = new ArrayList<TextRulerRuleItem>();
+   if (item1 != null) {
+     first.add(item1);
+   }
+   if (item2 != null) {
+     second.add(item2);
+   }
+   return getGeneralizationsForRuleItemLists(first, second);
+ }
+
+ // Generalizes two lists of rule items into single items: builds the proposed word, tag and
+ // class constraint variants and returns every combination of them as a new RapierRuleItem.
+ // One of the two lists may be empty (see oneListIsEmpty); both empty is only logged.
+ private static ArrayList<TextRulerRuleItem> getGeneralizationsForRuleItemLists(
+ ArrayList<TextRulerRuleItem> item1List, ArrayList<TextRulerRuleItem> item2List) {
+ ArrayList<RapierRuleItem> proposedWordConstraints = new ArrayList<RapierRuleItem>();
+ ArrayList<RapierRuleItem> proposedTagConstraints = new ArrayList<RapierRuleItem>();
+ ArrayList<RapierRuleItem> proposedClassConstraints = new ArrayList<RapierRuleItem>();
+ ArrayList<TextRulerRuleItem> result = new ArrayList<TextRulerRuleItem>();
+
+ int resultListLen1 = 0;
+ int resultListLen2 = 0;
+ boolean oneListIsEmpty = false;
+
+ if (item1List.size() == 0 && item2List.size() == 0) {
+ TextRulerToolkit.log("ERROR !"); // only logged; processing continues
+ }
+ if (item1List.size() == 0 || item2List.size() == 0) {
+ // remembered because it decides setListBeginsAtZero(...) for the result items below
+ oneListIsEmpty = true;
+ }
+
+ boolean hasEmptyWordList = false;
+ int maxWordCount = 0;
+ boolean hasEmptyTagList = false;
+ int maxTagCount = 0;
+ for (TextRulerRuleItem rt : item2List) { // sum the lengths and gather constraint statistics of list 2
+ RapierRuleItem t = (RapierRuleItem) rt;
+ resultListLen2 += t.isListItem() ? t.listLen() : 1;
+ if (t.getWordConstraints().size() > maxWordCount)
+ maxWordCount = t.getWordConstraints().size();
+ if (t.getWordConstraints().size() == 0)
+ hasEmptyWordList = true;
+ if (t.getTagConstraints().size() > maxTagCount)
+ maxTagCount = t.getTagConstraints().size();
+ if (t.getTagConstraints().size() == 0)
+ hasEmptyTagList = true;
+ }
+ for (TextRulerRuleItem rt : item1List) { // the same for list 1; the max/empty statistics are shared
+ RapierRuleItem t = (RapierRuleItem) rt;
+ resultListLen1 += t.isListItem() ? t.listLen() : 1;
+ if (t.getWordConstraints().size() > maxWordCount)
+ maxWordCount = t.getWordConstraints().size();
+ if (t.getWordConstraints().size() == 0)
+ hasEmptyWordList = true;
+ if (t.getTagConstraints().size() > maxTagCount)
+ maxTagCount = t.getTagConstraints().size();
+ if (t.getTagConstraints().size() == 0)
+ hasEmptyTagList = true;
+ }
+ int resultListLen = resultListLen1 > resultListLen2 ? resultListLen1 : resultListLen2; // take
+ // the larger of the two
+ // summed lengths (a list
+ // item counts with its listLen(),
+ // every other item with 1)
+ if (resultListLen == 1 && !oneListIsEmpty)
+ resultListLen = 0; // a length of 1 is kept only when one input list is
+ // empty; otherwise the result gets listLen 0.
+ // presumably listLen 0 means "not a list item" - TODO confirm in RapierRuleItem
+
+ // generalize word constraints:
+ if (hasEmptyWordList) // at least one item (of either list) has no word constraints
+ {
+ // propose only an item without word constraints
+ proposedWordConstraints.add(new RapierRuleItem());
+ } else // propose the union of all word constraints AND (if no single
+ // item already had all of them) dropping the constraint
+ {
+ RapierRuleItem proposed = new RapierRuleItem();
+ for (TextRulerRuleItem t : item1List)
+ proposed.addWordConstraints(((RapierRuleItem) t).getWordConstraints());
+ for (TextRulerRuleItem t : item2List)
+ proposed.addWordConstraints(((RapierRuleItem) t).getWordConstraints());
+
+ proposedWordConstraints.add(proposed);
+
+ // if the union of the constraints is a real union (no item
+ // subsumes all the others completely),
+ // we have to add the DROPPING OF THE CONSTRAINT as a second
+ // proposed word constraint
+ if (maxWordCount != proposed.getWordConstraints().size()) // the
+ // union
+ // differs in size
+ // from the largest
+ // single constraint list.
+ // NOTE(review): compares sizes only; assumes
+ // addWordConstraints() drops duplicates - TODO confirm
+ {
+ proposedWordConstraints.add(new RapierRuleItem());
+ }
+ }
+
+ if (hasEmptyTagList) // at least one item (of either list) has no tag constraints
+ {
+ // propose only an item without tag constraints
+ proposedTagConstraints.add(new RapierRuleItem());
+ } else // propose the union of all tag constraints AND (if no single
+ // item already had all of them) dropping the constraint
+ {
+ RapierRuleItem proposed = new RapierRuleItem();
+ for (TextRulerRuleItem t : item1List)
+ proposed.addTagConstraints(((RapierRuleItem) t).getTagConstraints());
+ for (TextRulerRuleItem t : item2List)
+ proposed.addTagConstraints(((RapierRuleItem) t).getTagConstraints());
+
+ proposedTagConstraints.add(proposed);
+
+ // if the union of the constraints is a real union (no item
+ // subsumes all the others completely),
+ // we have to add the DROPPING OF THE CONSTRAINT as a second
+ // proposed tag constraint
+ if (maxTagCount != proposed.getTagConstraints().size()) // the union
+ // differs in size from the
+ // largest single tag
+ // constraint list (size check only)
+ {
+ proposedTagConstraints.add(new RapierRuleItem());
+ }
+ }
+
+ // TODO semantic class generalization
+ proposedClassConstraints.add(new RapierRuleItem()); // add only the
+ // version
+ // WITHOUT any
+ // class constraint
+ // for now!
+
+ // finally, create all combinations of the above proposed items (word x tag x class)
+ for (RapierRuleItem wt : proposedWordConstraints) {
+ for (RapierRuleItem tt : proposedTagConstraints) {
+ for (RapierRuleItem ct : proposedClassConstraints) {
+ RapierRuleItem newItem = new RapierRuleItem();
+ for (TextRulerWordConstraint wi : wt.getWordConstraints())
+ newItem.addWordConstraint(wi.copy());
+ for (String ti : tt.getTagConstraints())
+ newItem.addTagConstraint(ti);
+ for (String tc : ct.getClassConstraints())
+ newItem.addClassConstraint(tc);
+ newItem.setListLen(resultListLen);
+ newItem.setListBeginsAtZero(oneListIsEmpty && resultListLen > 0);
+ result.add(newItem);
+ }
+ }
+ }
+ return result;
+ }
+
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // --- EQUAL SIZE PATTERN GENERALIZATION
+ // --------------------------------------------------------------------------------------------------
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // Generalizes two patterns that have the same number of items. The items are paired by
+ // position; every pair yields a list of candidate generalizations (LGGs), and each way of
+ // picking one candidate per position becomes one result pattern.
+ private static ArrayList<TextRulerRulePattern> getGeneralizationsForRuleItemPatternsOfEqualSize(
+     TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
+   ArrayList<TextRulerRulePattern> combinedPatterns = new ArrayList<TextRulerRulePattern>();
+   ArrayList<ArrayList<TextRulerRuleItem>> candidatesPerPosition =
+       new ArrayList<ArrayList<TextRulerRuleItem>>();
+
+   Iterator<TextRulerRuleItem> secondItems = pattern2.iterator();
+   if (pattern1.size() != pattern2.size()) {
+     // only logged; the loop below still walks over all items of pattern1
+     TextRulerToolkit.log("ERROR!");
+   }
+
+   // one row of candidates per position
+   Iterator<TextRulerRuleItem> firstItems = pattern1.iterator();
+   while (firstItems.hasNext()) {
+     TextRulerRuleItem left = firstItems.next();
+     TextRulerRuleItem right = secondItems.next();
+     ArrayList<TextRulerRuleItem> candidates = getGeneralizationsForRuleItems(left, right);
+     candidatesPerPosition.add(candidates);
+   }
+
+   // expand the table: every combination of one candidate per row is a new pattern
+   recursiveBuildAllRuleItemCombinations(candidatesPerPosition, 0, new TextRulerRulePattern(),
+       combinedPatterns);
+   return combinedPatterns;
+ }
+
+ // Depth-first expansion of the candidate table: appends one candidate of row curIndex to
+ // currentPattern, recurses into the next row and removes the candidate again. Once
+ // curIndex is past the last row, a deep copy of currentPattern is added to resultPatterns.
+ private static void recursiveBuildAllRuleItemCombinations(
+     ArrayList<ArrayList<TextRulerRuleItem>> table, int curIndex,
+     TextRulerRulePattern currentPattern, ArrayList<TextRulerRulePattern> resultPatterns) {
+   if (curIndex < table.size()) {
+     for (TextRulerRuleItem candidate : table.get(curIndex)) {
+       currentPattern.add(candidate);
+       recursiveBuildAllRuleItemCombinations(table, curIndex + 1, currentPattern, resultPatterns);
+       // back to the state before the recursion
+       currentPattern.remove(currentPattern.size() - 1);
+     }
+     return;
+   }
+
+   // all rows consumed: store a deep copy of the accumulated pattern
+   TextRulerRulePattern snapshot = new TextRulerRulePattern();
+   for (TextRulerRuleItem item : currentPattern) {
+     snapshot.add(item.copy());
+   }
+   resultPatterns.add(snapshot);
+ }
+
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // --- DISTINCT SIZE PATTERN GENERALIZATION
+ // -----------------------------------------------------------------------------------------------
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+
+ // Low-level generalization of two patterns of DIFFERENT size. Depending on the sizes it
+ // either collapses both patterns into one-item result patterns (special cases 1-3 below)
+ // or enumerates ALL ways of mapping the items of the shorter pattern onto groups of items
+ // of the longer one. getGeneralizationsForPatternSegmentation (used by the optimizing
+ // version) calls this method for every segment whose two halves differ in size.
+ private static ArrayList<TextRulerRulePattern> getGeneralizationsForRuleItemPatternsOfDistinctSize(
+ TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
+ ArrayList<TextRulerRulePattern> resultList = new ArrayList<TextRulerRulePattern>();
+ if (pattern1.size() == pattern2.size()) {
+ TextRulerToolkit.log("ERROR! CALL getGeneralizationsForRuleItemPatternsOfEqualSize instead!");
+ if (TextRulerToolkit.DEBUG)
+ return null; // NOTE(review): null only if DEBUG is set, otherwise processing continues; the caller in this class does not check for null
+ }
+
+ TextRulerRulePattern longerPattern = pattern1;
+ TextRulerRulePattern shorterPattern = pattern2;
+ if (pattern2.size() > pattern1.size()) {
+ longerPattern = pattern2;
+ shorterPattern = pattern1;
+ }
+
+ if (longerPattern.size() <= 1 || shorterPattern.size() <= 1) {
+ // Special case 1: one of the two patterns is empty AND
+ // special case 2: one of them has only ONE element
+ if (longerPattern.size() + shorterPattern.size() == 0) {
+ TextRulerToolkit.log("ERROR !! BOTH PATTERNS ARE EMPTY!");
+ if (TextRulerToolkit.DEBUG)
+ return null; // NOTE(review): same DEBUG-only null return as above
+ }
+
+ // get all possible generalizations of the two patterns. the result of
+ // each generalization is ONE rule item, so we
+ // don't use TextRulerRulePattern here since this IS NOT a rule
+ // pattern! it's a list of possible generalizations:
+ ArrayList<TextRulerRuleItem> generalizations = getGeneralizationsForRuleItemLists(
+ longerPattern, shorterPattern);
+ // create a one-element result pattern for each:
+ for (TextRulerRuleItem item : generalizations) {
+ TextRulerRulePattern p = new TextRulerRulePattern();
+ p.add(item);
+ resultList.add(p);
+ }
+ }
+ // else SPECIAL CASE 3: size difference > 6 or longer pattern > 10 items // TODO make those values configurable ?
+ else if (((longerPattern.size() - shorterPattern.size()) > 6) || (longerPattern.size() > 10)) {
+ int resultListLen1 = 0; // summed length of the shorter pattern (list items count with listLen())
+ for (TextRulerRuleItem rt : shorterPattern)
+ resultListLen1 += ((RapierRuleItem) rt).isListItem() ? ((RapierRuleItem) rt).listLen() : 1;
+ int resultListLen2 = 0; // summed length of the longer pattern
+ for (TextRulerRuleItem rt : longerPattern)
+ resultListLen2 += ((RapierRuleItem) rt).isListItem() ? ((RapierRuleItem) rt).listLen() : 1;
+
+ RapierRuleItem singleItem = new RapierRuleItem(); // no constraints are added to this item
+ singleItem.setListLen(resultListLen1 > resultListLen2 ? resultListLen1 : resultListLen2);
+ TextRulerRulePattern singleItemPattern = new TextRulerRulePattern();
+ singleItemPattern.add(singleItem);
+ resultList.add(singleItemPattern);
+ } else { // both sizes > 1, size difference <= 6 and longer pattern <= 10 items
+ // create all possible generalization combinations, that is: how can
+ // we map elements of the shorter pattern
+ // to the ones of the longer pattern and then generalize each
+ // mapping (each group) of items?
+ ArrayList<ArrayList<RapierPatternItemMapping>> combinationList = new ArrayList<ArrayList<RapierPatternItemMapping>>();
+
+ recursiveBuildAllPossiblePatternMappingSequences(longerPattern, shorterPattern,
+ new ArrayList<RapierPatternItemMapping>(), combinationList);
+
+ for (ArrayList<RapierPatternItemMapping> mappingSequence : combinationList) {
+ resultList.addAll(getGeneralizationsForPatternMappingSequence(mappingSequence));
+ }
+ }
+ return resultList;
+ }
+
+ // Creates all possible ways to pair the items of the shorter pattern, in order, with
+ // groups of leading items of the longer pattern; e.g. for 1 2 3 4 5 vs. 1 2 3 the first
+ // shorter item is grouped with 1, 1+2 or 1+2+3. Complete sequences are added to resultList.
+ private static void recursiveBuildAllPossiblePatternMappingSequences(
+ TextRulerRulePattern longerPattern, TextRulerRulePattern shorterPattern,
+ ArrayList<RapierPatternItemMapping> currentMappingSequence,
+ ArrayList<ArrayList<RapierPatternItemMapping>> resultList) {
+ int windowSize = longerPattern.size() - shorterPattern.size() + 1; // max. group size for the first shorter item
+
+ if (shorterPattern.size() > longerPattern.size()) {
+ TextRulerToolkit.log("ERROR: SHORTER > LONGER !!"); // only logged
+ }
+ if (longerPattern.size() == 0 || shorterPattern.size() == 0) {
+ TextRulerToolkit.log("ERROR: SHORTER == LONGER == 0!"); // NOTE(review): taken when EITHER pattern is empty, despite the message; nothing is added to resultList
+ } else {
+ // if the remaining (sub-)patterns are of equal size or the shorter one has
+ // only one element left, put ALL remaining items into one last item mapping
+ // and emit a copy of the sequence as a final result:
+ if (shorterPattern.size() == 1 || (longerPattern.size() == shorterPattern.size())) {
+ RapierPatternItemMapping lastMapping = new RapierPatternItemMapping();
+ lastMapping.shorterPattern.addAll(shorterPattern);
+ lastMapping.longerPattern.addAll(longerPattern);
+ ArrayList<RapierPatternItemMapping> newMappingSequence = new ArrayList<RapierPatternItemMapping>();
+ newMappingSequence.addAll(currentMappingSequence);
+ newMappingSequence.add(lastMapping);
+ resultList.add(newMappingSequence);
+ } else { // otherwise we have to create all possible combinations of
+ // the longer and shorter remaining pattern:
+ TextRulerRuleItem firstItem = shorterPattern.get(0);
+ // combine it with longer items 0, 0/1, ... 0/1/2/.../windowSize-1
+ for (int maxi = 0; maxi < windowSize; maxi++) {
+ RapierPatternItemMapping newMapping = new RapierPatternItemMapping();
+ newMapping.shorterPattern.add(firstItem);
+ for (int li = 0; li <= maxi; li++)
+ newMapping.longerPattern.add(longerPattern.get(li));
+ currentMappingSequence.add(newMapping);
+ TextRulerRulePattern restLongerPattern = new TextRulerRulePattern(); // longer items after index maxi
+ TextRulerRulePattern restShorterPattern = new TextRulerRulePattern(); // shorter items after the first
+ for (int i = 1; i < shorterPattern.size(); i++)
+ restShorterPattern.add(shorterPattern.get(i));
+ for (int i = maxi + 1; i < longerPattern.size(); i++)
+ restLongerPattern.add(longerPattern.get(i));
+
+ // recurse on the remaining items:
+ recursiveBuildAllPossiblePatternMappingSequences(restLongerPattern, restShorterPattern,
+ currentMappingSequence, resultList);
+
+ // remove the mapping added above to get back to the same state as
+ // before the recursion:
+ currentMappingSequence.remove(currentMappingSequence.size() - 1);
+ }
+ }
+ }
+ }
+
+ // Generalizes one MAPPING sequence. A mapping pairs items of the shorter pattern with
+ // items of the longer pattern, and each mapping is generalized directly into rule items
+ // here. This is the lower of the two levels of dividing the problem:
+ // - a pattern SEGMENTATION (see getGeneralizationsForPatternSegmentation) splits the two
+ //   source patterns at equal items; its segments still need to be generalized;
+ // - a segment whose sub-patterns differ in size is generalized by
+ //   getGeneralizationsForRuleItemPatternsOfDistinctSize, which enumerates mappings and
+ //   calls THIS method for every mapping sequence.
+ // To make the two levels visible in the code, mappings use RapierPatternItemMapping,
+ // which is internally exactly the same class as RapierPatternSegment.
+ private static ArrayList<TextRulerRulePattern> getGeneralizationsForPatternMappingSequence(
+     ArrayList<RapierPatternItemMapping> patternMappingSequence) {
+   // one row per mapping, holding all generalizations proposed for that mapping
+   ArrayList<ArrayList<TextRulerRuleItem>> rowsPerMapping =
+       new ArrayList<ArrayList<TextRulerRuleItem>>();
+   for (RapierPatternItemMapping group : patternMappingSequence) {
+     ArrayList<TextRulerRuleItem> proposals =
+         getGeneralizationsForRuleItemLists(group.shorterPattern, group.longerPattern);
+     rowsPerMapping.add(proposals);
+   }
+
+   // as in the equal-size generalization, every way of choosing one entry per row is a
+   // new pattern
+   ArrayList<TextRulerRulePattern> expanded = new ArrayList<TextRulerRulePattern>();
+   recursiveBuildAllRuleItemCombinations(rowsPerMapping, 0, new TextRulerRulePattern(),
+       expanded);
+   return expanded;
+ }
+
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // --- FIND MATCHINGS BETWEEN PATTERNS FOR GENERALIZATION
+ // -----------------------------------------------------------------------------------------------
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+
+ // Finds equal items (equals()) in the two patterns and adds every resulting segmentation
+ // of the patterns to resultList; currentSegmentation is shared state that is restored after each recursion.
+ private static void recursiveFindPatternSegmentsByMatchingPatternItems(
+ TextRulerRulePattern longerPattern, TextRulerRulePattern shorterPattern,
+ ArrayList<RapierPatternSegment> currentSegmentation,
+ ArrayList<ArrayList<RapierPatternSegment>> resultList) {
+ int cmpWindowSize = longerPattern.size() - shorterPattern.size() + 1; // how far a match may be shifted to the right
+
+ // end of recursion:
+ // is one (rest-)pattern
+ // empty ?
+ if (longerPattern.size() == 0 || shorterPattern.size() == 0) {
+ // create result segmentation
+ ArrayList<RapierPatternSegment> newSegmentation = new ArrayList<RapierPatternSegment>();
+ newSegmentation.addAll(currentSegmentation); // add current
+ // recursive state
+ //
+ // then add the rest
+ // as one last segment if
+ // anything is left in one
+ // of the patterns
+ if (longerPattern.size() + shorterPattern.size() > 0) {
+ RapierPatternSegment lastSegment = new RapierPatternSegment();
+ for (TextRulerRuleItem i : shorterPattern)
+ lastSegment.shorterPattern.add(i);
+ for (TextRulerRuleItem i : longerPattern)
+ lastSegment.longerPattern.add(i);
+ newSegmentation.add(lastSegment);
+ }
+ resultList.add(newSegmentation);
+ } else {
+ boolean matched = false;
+ for (int si = 0; si < shorterPattern.size(); si++) {
+ // compare shorter element si with longer elements si, si+1, ... si+cmpWindowSize-1
+ for (int li = si; li < si + cmpWindowSize; li++) {
+ if (longerPattern.get(li).equals(shorterPattern.get(si))) {
+ // matched pair found! (the loops go on afterwards, so every match is explored)
+ matched = true;
+ // --> calculate pattern segments, add them to the
+ // current one and pass the rest of the
+ // patterns to the next recursion level:
+ RapierPatternSegment newSegment = new RapierPatternSegment(); // everything left of the match
+ for (int i = 0; i < si; i++)
+ newSegment.shorterPattern.add(shorterPattern.get(i));
+ for (int i = 0; i < li; i++)
+ newSegment.longerPattern.add(longerPattern.get(i));
+
+ boolean addedLeftSegmentation = false;
+ if (newSegment.longerPattern.size() > 0 || newSegment.shorterPattern.size() > 0) {
+ // only add if the left segment is not empty!
+ currentSegmentation.add(newSegment);
+ addedLeftSegmentation = true;
+ }
+
+ RapierPatternSegment matchedSegment = new RapierPatternSegment(); // the matched pair itself
+ matchedSegment.shorterPattern.add(shorterPattern.get(si));
+ matchedSegment.longerPattern.add(longerPattern.get(li));
+ currentSegmentation.add(matchedSegment);
+
+ // the rest is now the rest to the right of both (so
+ // li+1 and si+1 to the ends...)
+ TextRulerRulePattern restLongerPattern = new TextRulerRulePattern();
+ TextRulerRulePattern restShorterPattern = new TextRulerRulePattern();
+ for (int i = li + 1; i < longerPattern.size(); i++)
+ restLongerPattern.add(longerPattern.get(i));
+ for (int i = si + 1; i < shorterPattern.size(); i++)
+ restShorterPattern.add(shorterPattern.get(i));
+
+ // recurse... the first argument is the rest that is treated as the longer pattern
+ if (restLongerPattern.size() > restShorterPattern.size())
+ recursiveFindPatternSegmentsByMatchingPatternItems(restLongerPattern,
+ restShorterPattern, currentSegmentation, resultList);
+ else // since li - si < cmpWindowSize this is reached with equal rest sizes, provided longerPattern was not the shorter one on entry
+ recursiveFindPatternSegmentsByMatchingPatternItems(restShorterPattern,
+ restLongerPattern, currentSegmentation, resultList);
+
+ // remove added segments so that we are in the same
+ // state as before the recursion:
+ if (addedLeftSegmentation)
+ currentSegmentation.remove(currentSegmentation.size() - 1); // one removal for the left side
+ // segment (the entry popped first is actually the matched one; only the count matters)
+ currentSegmentation.remove(currentSegmentation.size() - 1); // one
+ // removal
+ // for the matched
+ // segment
+ }
+ }
+ }
+ if (!matched) // no equal items: add all items of both patterns as one pattern
+ // segment and emit the segmentation
+ {
+ ArrayList<RapierPatternSegment> newSegmentation = new ArrayList<RapierPatternSegment>();
+ newSegmentation.addAll(currentSegmentation);
+
+ RapierPatternSegment lastSegment = new RapierPatternSegment();
+ for (TextRulerRuleItem i : shorterPattern)
+ lastSegment.shorterPattern.add(i);
+ for (TextRulerRuleItem i : longerPattern)
+ lastSegment.longerPattern.add(i);
+ newSegmentation.add(lastSegment);
+ resultList.add(newSegmentation);
+ }
+ }
+ }
+
+ // Generalizes one complete segmentation of two patterns. Each segment is generalized on
+ // its own (by the equal-size method if both halves have the same size, otherwise by
+ // getGeneralizationsForRuleItemPatternsOfDistinctSize), which yields a list of candidate
+ // sub-patterns per segment. Concatenating one candidate of every segment, in segment
+ // order, gives one result pattern; all such concatenations are returned.
+ private static ArrayList<TextRulerRulePattern> getGeneralizationsForPatternSegmentation(
+     ArrayList<RapierPatternSegment> patternSegmentation) {
+   // outer list: one entry per segment; inner list: that segment's candidate sub-patterns
+   ArrayList<ArrayList<TextRulerRulePattern>> candidatesPerSegment =
+       new ArrayList<ArrayList<TextRulerRulePattern>>();
+
+   for (RapierPatternSegment segment : patternSegmentation) {
+     boolean sameSize = segment.longerPattern.size() == segment.shorterPattern.size();
+     ArrayList<TextRulerRulePattern> candidates = sameSize
+         ? getGeneralizationsForRuleItemPatternsOfEqualSize(segment.longerPattern,
+             segment.shorterPattern)
+         : getGeneralizationsForRuleItemPatternsOfDistinctSize(segment.longerPattern,
+             segment.shorterPattern);
+     candidatesPerSegment.add(candidates);
+   }
+
+   // build all combinations of the per-segment candidates; the result is a list of new
+   // generalized patterns
+   ArrayList<TextRulerRulePattern> combined = new ArrayList<TextRulerRulePattern>();
+   recursiveBuildAllRuleItemCombinationsFromPatterns(candidatesPerSegment, 0,
+       new TextRulerRulePattern(), combined);
+   return combined;
+ }
+
+ // Depth-first expansion of a table whose rows hold whole sub-patterns: the items of one
+ // sub-pattern of row curIndex are appended to currentPattern, the next row is processed
+ // recursively, and the same number of items is removed from the end again. Once curIndex
+ // is past the last row, a deep copy of currentPattern is added to resultPatterns.
+ private static void recursiveBuildAllRuleItemCombinationsFromPatterns(
+     ArrayList<ArrayList<TextRulerRulePattern>> table, int curIndex,
+     TextRulerRulePattern currentPattern, ArrayList<TextRulerRulePattern> resultPatterns) {
+   if (curIndex < table.size()) {
+     for (TextRulerRulePattern subPattern : table.get(curIndex)) {
+       currentPattern.addAll(subPattern);
+       recursiveBuildAllRuleItemCombinationsFromPatterns(table, curIndex + 1, currentPattern,
+           resultPatterns);
+       // undo the addAll: drop as many trailing items as the sub-pattern has
+       for (int dropped = 0; dropped < subPattern.size(); dropped++) {
+         currentPattern.remove(currentPattern.size() - 1);
+       }
+     }
+     return;
+   }
+
+   // all rows consumed: store a deep copy of the accumulated pattern
+   TextRulerRulePattern snapshot = new TextRulerRulePattern();
+   for (TextRulerRuleItem item : currentPattern) {
+     snapshot.add(item.copy());
+   }
+   resultPatterns.add(snapshot);
+ }
+
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // --- WORKING ON PATTERNS OF DISTINCT LENGTH - OPTIMIZED
+ // ---------------------------------------------------------------------------------
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+
+ // "optimized", because this method uses the pattern item matching
+ // optimization (search for equal items and make a segmentation, etc.)
+ private static ArrayList<TextRulerRulePattern> getOptimizedGeneralizationsForRuleItemPatternsOfDistinctSize(
+ TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
+ ArrayList<ArrayList<RapierPatternSegment>> matchedCombinationList = new ArrayList<ArrayList<RapierPatternSegment>>();
+
+ // in order to reduce the number of possible combinations of how to combine
+ // the elements of the shorter pattern with the
+ // elements of the longer pattern, we first search for equal pattern
+ // items in both patterns. those equal items get hardwired
+ // in the combination and the left and right remaining subpatterns stay
+ // as a "divided smaller problem" that needs to be conquered...
+ // the result of the search is a list of possible segmentations of the
+ // patterns. the segments of each segmentation are those "smaller"
+ // remaining problems that we then need to generalize in the original manner.
+ // if no equal items are found, one segmentation with only one segment
+ // (the original longer and shorter pattern) is returned and
+ // has to be generalized.
+ if (pattern1.size() > pattern2.size()) // the pattern with more items is passed first
+ recursiveFindPatternSegmentsByMatchingPatternItems(pattern1, pattern2,
+ new ArrayList<RapierPatternSegment>(), matchedCombinationList);
+ else
+ recursiveFindPatternSegmentsByMatchingPatternItems(pattern2, pattern1,
+ new ArrayList<RapierPatternSegment>(), matchedCombinationList);
+
+ // if (TextRulerToolkit.DEBUG && matchedCombinationList.size() > 1)
+ // {
+ // TextRulerToolkit.log("PATTERN SEQUENCES FOUND: "+matchedCombinationList.size());
+ // for (ArrayList<RapierPatternSegment> patternSequence :
+ // matchedCombinationList)
+ // {
+ // TextRulerToolkit.log("\tNEXT SEQUENCE");
+ // for (RapierPatternSegment pSeg : patternSequence)
+ // {
+ // TextRulerToolkit.log("\t\t"+pSeg.longerPattern);
+ // TextRulerToolkit.log("\t\t"+pSeg.shorterPattern);
+ // }
+ // }
+ // }
+
+ ArrayList<TextRulerRulePattern> resultList = new ArrayList<TextRulerRulePattern>();
+
+ for (ArrayList<RapierPatternSegment> patternSegmentation : matchedCombinationList) {
+ // the generalizations of all segmentations are concatenated. TODO filter out possible duplicates ?
+ resultList.addAll(getGeneralizationsForPatternSegmentation(patternSegmentation));
+ }
+ return resultList;
+ }
+
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+ // --- THE FINAL RESULT: a generalization method
+ // -------------------------------------------------------------------------------------------
+ // ----------------------------------------------------------------------------------------------------------------------------------------
+
+ // input: two sequences of rule items (=patterns) that shall be
+ // generalized... matchings are searched for a optimized search
+ // and to get a not too big count of generalizations...
+ // result: a (probably very large!) list of possible generalizations, e.g.
+ // used for all slotfiller generalizations of two rules...
+ public static ArrayList<TextRulerRulePattern> getGeneralizationsForRuleItemPatterns(
+ TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
+ ArrayList<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
+
+ // if (TextRulerToolkit.DEBUG)
+ // {
+ // TextRulerToolkit.log("\tgetGeneralizationsForRuleItemPatterns:");
+ // TextRulerToolkit.log("\tPattern1:"+pattern1);
+ // TextRulerToolkit.log("\tPattern2:"+pattern2);
+ // }
+
+ if (pattern1.size() == 0 && pattern2.size() == 0) {
+ return result; // return empty list
+ } else if (pattern1.size() == pattern2.size()) // both have the same
+ // pattern item count
+ {
+ // generalizing is easy then: simply generalize each pair of items:
+ result = getGeneralizationsForRuleItemPatternsOfEqualSize(pattern1, pattern2);
+ } else {
+ // TextRulerToolkit.logIf(TextRulerToolkit.DEBUG && pattern1.size()
+ // == 0 || pattern2.size() == 0, "SpecialCaseWithZeroLength");
+ result = getOptimizedGeneralizationsForRuleItemPatternsOfDistinctSize(pattern1, pattern2);
+ }
+
+ // if (TextRulerToolkit.DEBUG)
+ // {
+ // TextRulerToolkit.log("\t\tGeneralizations: "+result.size());
+ // for (TextRulerRulePattern lggPattern : result)
+ // TextRulerToolkit.log("\t\t\t"+lggPattern);
+ // }
+
+ return result;
+ }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierGeneralizationHelper.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierGeneralizationHelper.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternItemMapping.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternItemMapping.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternItemMapping.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternItemMapping.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,5 @@
+package org.apache.uima.tm.textruler.rapier;
+
+/**
+ * A {@link RapierPatternSegment} subclass that declares no members of its own: it only inherits
+ * the {@code shorterPattern} / {@code longerPattern} pair and {@code debugOutput()} from its
+ * parent.
+ *
+ * NOTE(review): presumably this exists as a distinct type so callers can tell an item mapping
+ * apart from a plain segment - confirm against the code that creates these objects.
+ */
+public class RapierPatternItemMapping extends RapierPatternSegment {
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternItemMapping.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternItemMapping.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternSegment.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternSegment.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternSegment.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternSegment.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,25 @@
+package org.apache.uima.tm.textruler.rapier;
+
+import org.apache.uima.tm.textruler.core.TextRulerRuleItem;
+import org.apache.uima.tm.textruler.core.TextRulerRulePattern;
+import org.apache.uima.tm.textruler.core.TextRulerToolkit;
+
+public class RapierPatternSegment {
+
+ TextRulerRulePattern shorterPattern = new TextRulerRulePattern();
+
+ TextRulerRulePattern longerPattern = new TextRulerRulePattern();
+
+ public void debugOutput() {
+
+ TextRulerToolkit.log("\n-------------\nShorterList: ");
+ for (TextRulerRuleItem t : shorterPattern)
+ System.out.print(t.getStringForRuleString(null, null, 0, 1, 0, 1, 0) + " ");
+ TextRulerToolkit.log("");
+
+ System.out.print("LongerList: ");
+ for (TextRulerRuleItem t : longerPattern)
+ System.out.print(t.getStringForRuleString(null, null, 0, 1, 0, 1, 0) + " ");
+ TextRulerToolkit.log("");
+ }
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternSegment.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPatternSegment.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPlugin.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPlugin.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPlugin.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPlugin.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,54 @@
+package org.apache.uima.tm.textruler.rapier;
+
+import org.eclipse.core.runtime.Plugin;
+import org.osgi.framework.BundleContext;
+
+/**
+ * The activator class controls the plug-in life cycle.
+ */
+/**
+ * Activator of the RAPIER bundle. It keeps a reference to itself while the bundle is started so
+ * that other code can reach it through {@link #getDefault()}.
+ */
+public class RapierPlugin extends Plugin {
+
+  /** The plug-in ID. */
+  public static final String PLUGIN_ID = "org.apache.uima.tm.textruler.rapier";
+
+  /** The shared instance: assigned in {@link #start}, reset to {@code null} in {@link #stop}. */
+  private static RapierPlugin plugin;
+
+  /**
+   * Creates the activator; no state is set up here.
+   */
+  public RapierPlugin() {
+    super();
+  }
+
+  /**
+   * Returns the shared instance.
+   *
+   * @return the instance recorded by {@link #start}, or {@code null} before start / after stop
+   */
+  public static RapierPlugin getDefault() {
+    return RapierPlugin.plugin;
+  }
+
+  /**
+   * Delegates to the superclass and then records this object as the shared instance.
+   *
+   * @see org.eclipse.core.runtime.Plugin#start(org.osgi.framework.BundleContext)
+   */
+  @Override
+  public void start(BundleContext context) throws Exception {
+    super.start(context);
+    RapierPlugin.plugin = this;
+  }
+
+  /**
+   * Clears the shared instance and then delegates to the superclass.
+   *
+   * @see org.eclipse.core.runtime.Plugin#stop(org.osgi.framework.BundleContext)
+   */
+  @Override
+  public void stop(BundleContext context) throws Exception {
+    RapierPlugin.plugin = null;
+    super.stop(context);
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPlugin.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPlugin.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPreferencePage.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPreferencePage.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPreferencePage.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPreferencePage.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,92 @@
+package org.apache.uima.tm.textruler.rapier;
+
+import java.util.ArrayList;
+import java.util.Map;
+
+import org.apache.uima.tm.textruler.TextRulerPlugin;
+import org.apache.uima.tm.textruler.extension.TextRulerController;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerController;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerParameter;
+import org.eclipse.jface.preference.BooleanFieldEditor;
+import org.eclipse.jface.preference.FieldEditor;
+import org.eclipse.jface.preference.FieldEditorPreferencePage;
+import org.eclipse.jface.preference.IPreferenceStore;
+import org.eclipse.jface.preference.StringFieldEditor;
+import org.eclipse.ui.IWorkbench;
+import org.eclipse.ui.IWorkbenchPreferencePage;
+
+
+public class RapierPreferencePage extends FieldEditorPreferencePage implements
+ IWorkbenchPreferencePage {
+
+ public static String ID = "org.apache.uima.tm.textruler.algorithmPages";
+
+ private TextRulerLearnerController algorithmController;
+
+ private IPreferenceStore store;
+
+ private ArrayList<FieldEditor> fields = new ArrayList<FieldEditor>();
+
+ public RapierPreferencePage() {
+ super(FieldEditorPreferencePage.GRID);TextRulerLearnerController ctrl = TextRulerController.getControllerForID("org.apache.uima.tm.textruler.rapier");
+ this.algorithmController = ctrl;
+ store = TextRulerPlugin.getDefault().getPreferenceStore();
+ setPreferenceStore(store);
+ }
+
+ @Override
+ public void init(IWorkbench workbench) {
+ }
+
+ protected void createFieldEditors() {
+ TextRulerLearnerFactory f = algorithmController.getFactory();
+ TextRulerLearnerParameter[] params = f.getAlgorithmParameters();
+ Map<String, Object> values = f.getAlgorithmParameterStandardValues();
+ if (params != null) {
+ for (int i = 0; i < params.length; i++) {
+ TextRulerLearnerParameter p = params[i];
+ String id = algorithmController.getID() + "." + p.id;
+ FieldEditor l = null;
+ switch (p.type) {
+ case ML_BOOL_PARAM: {
+ l = new BooleanFieldEditor(id, p.name, getFieldEditorParent());
+ fields.add(l);
+ addField(l);
+ store.setDefault(id, (Boolean) values.get(p.id));
+ l.setPreferenceStore(store);
+ break;
+ }
+
+ case ML_FLOAT_PARAM:
+ case ML_INT_PARAM:
+ case ML_STRING_PARAM: {
+ l = new StringFieldEditor(id, p.name, getFieldEditorParent());
+ fields.add(l);
+ addField(l);
+ store.setDefault(id, values.get(p.id).toString());
+ l.setPreferenceStore(store);
+ break;
+ }
+ case ML_SELECT_PARAM:
+ break;
+ }
+ }
+ }
+ }
+
+ @Override
+ protected void performDefaults() {
+ for (FieldEditor f : fields)
+ f.loadDefault();
+ // super.performDefaults();
+ }
+
+ @Override
+ public boolean performOk() {
+ for (FieldEditor f : fields)
+ f.store();
+ // return super.performOk();
+ return true;
+ }
+}
\ No newline at end of file
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPreferencePage.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler.rapier/src/main/java/org/apache/uima/tm/textruler/rapier/RapierPreferencePage.java
------------------------------------------------------------------------------
svn:mime-type = text/plain