You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2011/08/12 12:32:54 UTC
svn commit: r1157037 [4/10] - in
/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler: ./ .settings/
META-INF/ icons/ schema/ src/ src/main/ src/main/java/ src/main/java/org/
src/main/java/org/apache/ src/main/java/org/apache/uima/ src/main/jav...
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/BasicLP2.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/BasicLP2.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/BasicLP2.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/BasicLP2.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,579 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.textruler.learner.lp2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerExampleDocument;
+import org.apache.uima.textmarker.textruler.core.TextRulerRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleList;
+import org.apache.uima.textmarker.textruler.core.TextRulerShiftExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget.MLTargetType;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.util.FileUtils;
+
+public abstract class BasicLP2 extends TextRulerBasicLearner {
+
+ // Keys looked up by setParameters(Map) in the parameter map:
+
+ /** Parameter key whose value is stored in {@link #windowSize}. */
+ public static final String WINDOW_SIZE_KEY = "windowSize";
+
+ /** Parameter key whose value is stored in {@link #maxCurrentBestRulesCount}. */
+ public static final String CURRENT_BEST_RULES_SIZE_KEY = "currentBestRulesSize";
+
+ /** Parameter key whose value is stored in {@link #maxCurrentContextualRulesCount}. */
+ public static final String CURRENT_CONTEXTUAL_RULES_SIZE_KEY = "currentContextualRulesSize";
+
+ /** Parameter key whose value is stored in {@link #minCoveredPositives}. */
+ public static final String MIN_COVERED_POSITIVES_PER_RULE_KEY = "minCoveredPositivesPerRule";
+
+ /** Parameter key whose value is stored in {@link #maxErrorThreshold}. */
+ public static final String MAX_ERROR_THRESHOLD_KEY = "maxErrorThreshold";
+
+ // Default values used until setParameters(Map) overrides them:
+
+ /** Default for {@link #windowSize}. */
+ public static final int STANDARD_WINDOW_SIZE = 2;
+
+ /** Default for {@link #maxCurrentBestRulesCount}. */
+ public static final int STANDARD_MAX_CURRENT_BEST_RULES_COUNT = 4;
+
+ /** Default for {@link #maxCurrentContextualRulesCount}. */
+ public static final int STANDARD_MAX_CONTEXTUAL_RULES_COUNT = 4;
+
+ /** Default for {@link #minCoveredPositives}. */
+ public static final int STANDARD_MIN_COVERED_POSITIVES_PER_RULE = 1;
+
+ /** Default for {@link #maxErrorThreshold}. */
+ public static final float STANDARD_MAX_ERROR_THRESHOLD = 0.1f;
+
+ // NOTE(review): not referenced anywhere in this class as shown; presumably the name of the
+ // annotation used by correction (shift) rules - confirm against the users of this constant.
+ public static final String CORRECTION_ANNOTATION_NAME = "lp2shift";
+
+ /** Default for {@link #shiftSize}. */
+ private static final int STANDARD_SHIFT_SIZE = 2;
+
+ /** Capacity handed to the per-seed "best rules" queue created in learnTaggingRules. */
+ protected int maxCurrentBestRulesCount = STANDARD_MAX_CURRENT_BEST_RULES_COUNT;
+
+ /** Capacity handed to the per-seed "contextual rules" queue created in learnTaggingRules. */
+ protected int maxCurrentContextualRulesCount = STANDARD_MAX_CONTEXTUAL_RULES_COUNT;
+
+ // Not read in this class; presumably consumed by the concrete subclasses - TODO confirm.
+ protected int windowSize = STANDARD_WINDOW_SIZE;
+
+ /** Passed to TextRulerTarget.setMaxShiftDistance for the left-correction target. */
+ protected int shiftSize = STANDARD_SHIFT_SIZE;
+
+ // Not read in this class; presumably consumed by the concrete subclasses - TODO confirm.
+ protected int minCoveredPositives = STANDARD_MIN_COVERED_POSITIVES_PER_RULE;
+
+ // Not read in this class; presumably consumed by the concrete subclasses - TODO confirm.
+ protected float maxErrorThreshold = STANDARD_MAX_ERROR_THRESHOLD;
+
+ /** Positive examples of the target currently being learned (set in learnTaggingRules). */
+ protected List<TextRulerExample> examples;
+
+ /** Positive examples covered by rules that were already added to the best rules pool. */
+ protected Set<TextRulerExample> coveredExamples;
+
+ /** Set in runForSlotName to (size of the token count histogram - 1). */
+ protected int slotMaximumTokenCount = 0;
+
+ /** Best-rule queue of the current seed example; recreated for every round. */
+ protected LP2CurrentBestRulesQueue currentBestRules;
+
+ /** Contextual-rule queue of the current seed example; recreated for every round. */
+ protected LP2CurrentBestRulesQueue currentContextualRules;
+
+ /** Best rules accumulated over all rounds of one learnTaggingRules call. */
+ protected TextRulerRuleList bestRulesPool;
+
+ /** Contextual rules accumulated over all rounds of one learnTaggingRules call. */
+ protected TextRulerRuleList contextRulesPool;
+
+ // Rule strings captured by runForSlotName and assembled by getResultString():
+
+ protected String leftBoundaryBestRulesString = null;
+
+ protected String rightBoundaryBestRulesString = null;
+
+ protected String leftBoundaryContextualRulesString = null;
+
+ protected String rightBoundaryContextualRulesString = null;
+
+ /**
+  * Creates the learner. All arguments are handed unchanged to the
+  * {@link TextRulerBasicLearner} constructor; this class adds no initialisation of its own.
+  */
+ public BasicLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
+ Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+ super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, delegate);
+ }
+
+ protected TextRulerRuleList learnTaggingRules(TextRulerTarget target,
+ TextRulerRuleList contextualRules) {
+ if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY)
+ sendStatusUpdateToDelegate("Creating Left-Boundary Examples...",
+ TextRulerLearnerState.ML_RUNNING, false);
+ else if (target.type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
+ sendStatusUpdateToDelegate("Creating Right-Boundary Examples...",
+ TextRulerLearnerState.ML_RUNNING, false);
+ else if (target.type == MLTargetType.SINGLE_LEFT_CORRECTION)
+ sendStatusUpdateToDelegate("Creating Left Correction Examples...",
+ TextRulerLearnerState.ML_RUNNING, false);
+ else
+ // if (target.type == MLTargetType.SINGLE_RIGHT_CORRECTION)
+ sendStatusUpdateToDelegate("Creating Right Correction Examples...",
+ TextRulerLearnerState.ML_RUNNING, false);
+ exampleDocuments.clearCurrentExamples();
+ exampleDocuments.createExamplesForTarget(target);
+ examples = exampleDocuments.getAllPositiveExamples();
+
+ if (shouldAbort())
+ return null;
+ bestRulesPool = new TextRulerRuleList();
+ contextRulesPool = new TextRulerRuleList();
+ coveredExamples = new HashSet<TextRulerExample>();
+ int roundNumber = 0;
+ for (TextRulerExample e : examples)
+ if (!coveredExamples.contains(e)) {
+ if (shouldAbort())
+ break;
+ roundNumber++;
+ currentBestRules = new LP2CurrentBestRulesQueue(maxCurrentBestRulesCount);
+ currentContextualRules = new LP2CurrentBestRulesQueue(maxCurrentContextualRulesCount);
+ // TextRulerToolkit.log("Example: "+e.getAnnotation().getBegin()+" : "+e.getAnnotation().getEnd());
+
+ induceRulesFromExample(e, roundNumber);
+
+ // TextRulerToolkit.log("Best Rules from this Seed: "+currentBestRules.size());
+ // if (TextRulerToolkit.DEBUG && currentBestRules.size()>1)
+ // {
+ // for (TextRulerRule r : currentBestRules)
+ // {
+ // TextRulerToolkit.log("\tp="+r.getCoveringStatistics().getCoveredPositivesCount()+"; n="+r.getCoveringStatistics().getCoveredNegativesCount()+"; "+r.getRuleString());
+ // for (TextRulerExample ex :
+ // r.getCoveringStatistics().getCoveredPositiveExamples())
+ // {
+ // TextRulerToolkit.log("\t\te="+ex.getAnnotation().getBegin());
+ //
+ // }
+ // }
+ // }
+ for (LP2Rule bestRule : currentBestRules) {
+ addToFinalBestRulesPool(bestRule);
+ }
+ for (LP2Rule ctxRule : currentContextualRules) {
+ addToFinalContextRulesPool(ctxRule);
+ }
+ sendStatusUpdateToDelegate("New Rules added.", TextRulerLearnerState.ML_RUNNING, true);
+ }
+ if (TextRulerToolkit.DEBUG) {
+ bestRulesPool.saveToRulesFile(getIntermediateRulesFileName(), getTMFileHeaderString());
+ // for (TextRulerRule r : bestRulesPool)
+ // {
+ // TextRulerToolkit.log("p="+r.getCoveringStatistics().getCoveredPositivesCount()+"; n="+r.getCoveringStatistics().getCoveredNegativesCount()+"; "+r.getRuleString());
+ // }
+ }
+
+ TextRulerRuleList result = bestRulesPool;
+ if (contextualRules != null)
+ for (TextRulerRule r : contextRulesPool)
+ contextualRules.add(r);
+ bestRulesPool = null;
+ contextRulesPool = null;
+ return result;
+ }
+
+ /**
+  * Loads a CAS through the superclass and immediately adds the boundary annotations for all
+  * slots to it.
+  */
+ @Override
+ public CAS loadCAS(String fileName, CAS reuseCAS) {
+   final CAS loaded = super.loadCAS(fileName, reuseCAS);
+   prepareCASWithBoundaries(loaded);
+   return loaded;
+ }
+
+ /** Creates the boundary annotations in the given CAS, once for every configured slot name. */
+ public void prepareCASWithBoundaries(CAS cas) {
+   for (int i = 0; i < slotNames.length; i++) {
+     TextRulerExampleDocument.createBoundaryAnnotationsForCas(cas, slotNames[i], filterSet);
+   }
+ }
+
+ /** Runs {@link #prepareCASWithBoundaries(CAS)} on every CAS the example documents cache. */
+ public void prepareCachedCASesWithBoundaries() {
+   for (CAS cached : exampleDocuments.getCachedCASes()) {
+     prepareCASWithBoundaries(cached);
+   }
+ }
+
+ /** Lets the superclass clean up first, then drops all references held by the learning loop. */
+ @Override
+ protected void cleanUp() {
+   super.cleanUp();
+   // per-target example bookkeeping
+   coveredExamples = null;
+   examples = null;
+   // per-seed queues
+   currentBestRules = currentContextualRules = null;
+   // per-target rule pools
+   bestRulesPool = contextRulesPool = null;
+ }
+
+ /** Entry point of the learner: processes every configured slot and then reports "Done". */
+ @Override
+ protected void doRun() {
+   TextRulerToolkit.logIfDebug("--- LP2 START");
+
+   // CASes that are already cached need their boundaries now; any CAS loaded later gets
+   // them in loadCAS
+   prepareCachedCASesWithBoundaries();
+
+   for (String slotName : slotNames) {
+     runForSlotName(slotName);
+   }
+
+   sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
+   TextRulerToolkit.logIfDebug("--- LP2 END");
+ }
+
+ /**
+  * Runs all learning steps for a single slot: determines the maximum slot length in tokens,
+  * learns the left and right boundary rules (storing their rule strings for
+  * {@link #getResultString()}), writes a stub "rules.tm" into the temp directory and finally
+  * learns left-correction rules. Returns early when the run is aborted after the histogram
+  * or after the left boundary step.
+  *
+  * @param slotName
+  *          name of the slot to learn rules for
+  */
+ protected void runForSlotName(String slotName) {
+   // 1. get slot length histogram in order to find maximum slot length (counted in tokens)
+   sendStatusUpdateToDelegate("Creating slot length histogram...",
+           TextRulerLearnerState.ML_RUNNING, false);
+   List<Integer> histogram = exampleDocuments.getTokenCountHistogrammForSlotName(slotName,
+           TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet));
+   if (shouldAbort())
+     return;
+   // -1 since the zero-histogram point also needs a place!
+   slotMaximumTokenCount = histogram.size() - 1;
+
+   // learn left boundary best rules
+   TextRulerRuleList ctxRules = new TextRulerRuleList();
+   TextRulerRuleList bestRules = learnTaggingRules(new TextRulerTarget(slotName,
+           MLTargetType.SINGLE_LEFT_BOUNDARY, this), ctxRules);
+   if (bestRules != null) {
+     leftBoundaryBestRulesString = bestRules.getRulesString("");
+     leftBoundaryContextualRulesString = ctxRules.getRulesString("\t");
+     bestRules.clear(); // free some memory/references
+   }
+   if (shouldAbort())
+     return;
+
+   // learn right boundary best rules
+   ctxRules.clear();
+   bestRules = learnTaggingRules(new TextRulerTarget(slotName, MLTargetType.SINGLE_RIGHT_BOUNDARY,
+           this), ctxRules);
+   if (bestRules != null) {
+     rightBoundaryBestRulesString = bestRules.getRulesString("");
+     rightBoundaryContextualRulesString = ctxRules.getRulesString("\t");
+   }
+
+   // TODO add correction rule learn stuff
+   // testTaggingRulesAndCreateCorrectionRulesExamples(null, STANDARD_MAX_CONTEXTUAL_RULES_COUNT)
+
+   // TODO for now only a stub script is written instead of getResultString()
+   File file = new File(tempDirectory() + "rules.tm");
+   String resultString = "PACKAGE org.apache.uima.ml;\n\nDocument{->FILTERTYPE(SPACE, BREAK, NBSP, MARKUP)};\n";
+   try {
+     FileUtils.saveString2File(resultString, file);
+   } catch (IOException e) {
+     // TODO send text to ui
+     // Learning continues as before, but the failure is no longer silently dropped.
+     TextRulerToolkit.log("Could not write rules file " + file + ": " + e);
+   }
+
+   // correct left start; the learned rules are not used yet, so the result is not kept
+   TextRulerTarget lsTarget = new TextRulerTarget(slotName, MLTargetType.SINGLE_LEFT_CORRECTION,
+           this);
+   lsTarget.setMaxShiftDistance(shiftSize);
+   learnTaggingRules(lsTarget, null);
+
+   // TODO correct right start analogously with MLTargetType.SINGLE_RIGHT_CORRECTION
+
+   sendStatusUpdateToDelegate("SLOT Done", TextRulerLearnerState.ML_RUNNING, true);
+   TextRulerToolkit.logIfDebug("--- LP2 END FOR SLOT:" + slotName);
+ }
+
+ /**
+  * Induces rules from one seed example. learnTaggingRules calls this once for every positive
+  * example that is not yet covered, right after it has created fresh
+  * {@link #currentBestRules} and {@link #currentContextualRules} queues; whatever those
+  * queues contain when this method returns is merged into the rule pools.
+  *
+  * @param e
+  *          the uncovered positive example to start from
+  * @param roundNumber
+  *          1-based counter of the seeds processed so far for the current target
+  */
+ protected abstract void induceRulesFromExample(TextRulerExample e, int roundNumber);
+
+ protected void addToFinalContextRulesPool(LP2Rule rule) {
+ if (TextRulerToolkit.DEBUG && false)
+ TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxpool.tm", rule.getRuleString()
+ + "\n");
+
+ if (!contextRulesPool.contains(rule)) {
+ contextRulesPool.add(rule);
+ // TextRulerToolkit.log("CONTEXT RULE: "+rule.getRuleString()+" ; "+rule.getCoveringStatistics());
+ } else {
+ if (TextRulerToolkit.DEBUG && false) {
+ TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxpool.tm", "\tDUPLICATE\n");
+ }
+ }
+
+ }
+
+ protected void addToFinalBestRulesPool(LP2Rule rule) {
+ if (TextRulerToolkit.DEBUG && false)
+ TextRulerToolkit.appendStringToFile(tempDirectory() + "bestpool.tm", rule.getRuleString()
+ + "\n");
+
+ if (!bestRulesPool.contains(rule)) {
+ bestRulesPool.add(rule);
+ // TextRulerToolkit.log("BEST RULE: "+rule.getRuleString());
+ // add all covered positives to covering set
+ coveredExamples.addAll(rule.getCoveringStatistics().getCoveredPositiveExamples());
+ if (TextRulerToolkit.DEBUG)
+ bestRulesPool.saveToRulesFile(getIntermediateRulesFileName(), getTMFileHeaderString());
+ } else {
+ if (TextRulerToolkit.DEBUG && false) {
+ TextRulerToolkit.log("KANN SOWAS PASSIEREN ??");
+ TextRulerToolkit.appendStringToFile(tempDirectory() + "bestpool.tm", "\tDUPLICATE\n");
+ }
+ }
+
+ }
+
+ public String getResultString() {
+ String result = getTMFileHeaderString();
+ result += "// LEFT BOUNDARY RULES:\n";
+ if (leftBoundaryBestRulesString != null) {
+ result += leftBoundaryBestRulesString;
+ result += "\n// RIGHT BOUNDARY RULES:\n";
+ if (rightBoundaryBestRulesString != null)
+ result += rightBoundaryBestRulesString;
+ else if (bestRulesPool != null)
+ result += bestRulesPool.getRulesString("");
+
+ result += "\nBLOCK(contextualRules) Document{}\n" + "{\n"
+ + "\tDocument{->ASSIGN(redoContextualRules, false)}; // reset flag\n";
+ result += "\n\t// LEFT BOUNDARY CONTEXTUAL RULES:\n";
+ result += leftBoundaryContextualRulesString;
+
+ result += "\n\t// RIGHT BOUNDARY CONTEXTUAL RULES:\n";
+ if (rightBoundaryBestRulesString != null)
+ result += rightBoundaryContextualRulesString;
+ else if (contextRulesPool != null)
+ result += contextRulesPool.getRulesString("\t");
+
+ result += "\n\tDocument{IF(redoContextualRules)->CALL(thisFile.contextualRules)};\n}\n";
+ } else if (bestRulesPool != null) {
+ result += bestRulesPool.getRulesString("");
+ result += "\n\t// LEFT BOUNDARY CONTEXTUAL RULES:\n";
+ if (contextRulesPool != null)
+ result += contextRulesPool.getRulesString("");
+ }
+ String leftBoundary = TextRulerToolkit.getTypeShortName((new TextRulerTarget(slotNames[0],
+ MLTargetType.SINGLE_LEFT_BOUNDARY, this)).getSingleSlotTypeName());
+ String rightBoundary = TextRulerToolkit.getTypeShortName((new TextRulerTarget(slotNames[0],
+ MLTargetType.SINGLE_RIGHT_BOUNDARY, this)).getSingleSlotTypeName());
+ String slotMarkName = TextRulerToolkit.getTypeShortName(slotNames[0]);
+ int maxInnerLength = (slotMaximumTokenCount * 3) - 2;
+ result += "\n//slot-building rules:\n";
+ result += leftBoundary + "{IS(" + rightBoundary + ")->UNMARK(" + leftBoundary + "), UNMARK("
+ + rightBoundary + "), MARKONCE(" + slotMarkName + ")};\n";
+ result += leftBoundary + "{->UNMARK(" + leftBoundary + ")} ";
+ if (maxInnerLength > 0) {
+ result += "ANY[0, " + maxInnerLength + "]? ";
+ result += rightBoundary + "{->UNMARK(" + rightBoundary + "), MARKONCE(" + slotMarkName
+ + ", 1, 3)};\n";
+ } else
+ result += rightBoundary + "{->UNMARK(" + rightBoundary + "), MARKONCE(" + slotMarkName
+ + ", 1, 2)};\n";
+
+ result += "\n//cleaning up:\n" + leftBoundary + "{->UNMARK(" + leftBoundary + ")};\n"
+ + rightBoundary + "{->UNMARK(" + rightBoundary + ")};\n";
+ return result;
+ }
+
+ /**
+  * Applies the learner parameters found in the given map. Keys that are absent leave the
+  * current value (initially the STANDARD_* default) untouched. Values may be any
+  * {@link Number}; they are converted with {@code intValue()} or {@code floatValue()}, so a
+  * caller handing in e.g. a Long or a Double no longer fails with a ClassCastException.
+  *
+  * @param params
+  *          parameter values keyed by the *_KEY constants of this class
+  * @throws ClassCastException
+  *           if a present value is not a Number
+  * @throws NullPointerException
+  *           if a key is present but mapped to {@code null}
+  */
+ public void setParameters(Map<String, Object> params) {
+   if (TextRulerToolkit.DEBUG)
+     saveParametersToTempFolder(params);
+
+   if (params.containsKey(WINDOW_SIZE_KEY))
+     windowSize = ((Number) params.get(WINDOW_SIZE_KEY)).intValue();
+
+   if (params.containsKey(CURRENT_BEST_RULES_SIZE_KEY))
+     maxCurrentBestRulesCount = ((Number) params.get(CURRENT_BEST_RULES_SIZE_KEY)).intValue();
+
+   if (params.containsKey(CURRENT_CONTEXTUAL_RULES_SIZE_KEY))
+     maxCurrentContextualRulesCount = ((Number) params.get(CURRENT_CONTEXTUAL_RULES_SIZE_KEY))
+             .intValue();
+
+   if (params.containsKey(MIN_COVERED_POSITIVES_PER_RULE_KEY))
+     minCoveredPositives = ((Number) params.get(MIN_COVERED_POSITIVES_PER_RULE_KEY)).intValue();
+
+   if (params.containsKey(MAX_ERROR_THRESHOLD_KEY))
+     maxErrorThreshold = ((Number) params.get(MAX_ERROR_THRESHOLD_KEY)).floatValue();
+ }
+
+ protected String correctionRulesInputDirectory(TextRulerTarget target) {
+ if (target.isLeftBoundary())
+ return tempDirectory() + "leftCorrectionDocs";
+ else
+ return tempDirectory() + "rightCorrectionDocs";
+ }
+
+ /**
+  * Applies a rule script to every example document, collects the wrongly inserted boundary
+  * tags and tries to pair each of them with a correct tag at most {@code maxDistance} tokens
+  * to its left or right. For every pair a {@link TextRulerShiftExample} is created, and the
+  * processed test CAS is written as XMI into the correction documents directory.
+  *
+  * NOTE(review): this method is work in progress. The rule script is read from a hard-coded
+  * path and the created shift examples are collected but not used yet.
+  *
+  * @param target
+  *          the boundary target whose rules are tested
+  * @param maxDistance
+  *          maximum number of tokens to look to the left and right of a wrong tag
+  * @return {@code true} on success, {@code false} if any exception occurred (its stack trace
+  *         is printed)
+  */
+ protected boolean testTaggingRulesAndCreateCorrectionRulesExamples(TextRulerTarget target,
+         int maxDistance) {
+   try {
+     File dir = new File(correctionRulesInputDirectory(target));
+     if (!dir.exists())
+       dir.mkdir();
+     exampleDocuments.clearCurrentExamples();
+     exampleDocuments.createExamplesForTarget(target);
+     examples = exampleDocuments.getAllPositiveExamples();
+
+     TextRulerExampleDocument[] sortedDocs = exampleDocuments
+             .getSortedDocumentsInCacheOptimizedOrder();
+     TypeSystem ts = sortedDocs[0].getCAS().getTypeSystem();
+     Type tokensRootType = ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+
+     // String allRulesContent = getResultString();
+     // NOTE(review): hard-coded test input path, see the TODO in runForSlotName
+     String allRulesContent = FileUtils.file2String(new File("/testinput/testrules/rules.tm"));
+     FileUtils.saveString2File(allRulesContent, new File(getTempRulesFileName()));
+
+     CAS testCAS = getTestCAS();
+     for (TextRulerExampleDocument doc : sortedDocs) {
+       TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
+       doc.resetAndFillTestCAS(testCAS, target);
+       CAS docCAS = doc.getCAS();
+       ae.process(testCAS);
+       // test whole ruleset and collect negative examples
+       compareOriginalDocumentWithTestCAS(doc, testCAS, target, c, true);
+
+       // now we have some covered positive examples that are good, and maybe some negative
+       // examples. for those we might create Correction Rules... in order to do that we have
+       // to create ShiftExamples and map negative examples (incorrectly inserted boundaries)
+       // with a specific distance to an original positive example...
+
+       // TODO should that be done in both directions ? left and right ?! what happens if we
+       // find two potential examples, one left, one right ? --> for now: use the nearer one.
+       // if exactly the same distance, use the one where the wrong tag would be IN the slot
+       // filler!
+       List<TextRulerExample> correctTags = doc.getPositiveExamples();
+       List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>(
+               c.getCoveredNegativeExamples());
+       List<TextRulerShiftExample> newExamples = new ArrayList<TextRulerShiftExample>();
+       for (TextRulerExample wrongTag : wrongTags) {
+         // test, if there's a corresponding positive example somewhere around (within
+         // maxDistance)
+         List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag
+                 .getAnnotation().getBegin(), maxDistance, TextRulerToolkit
+                 .getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
+         List<AnnotationFS> right = TextRulerToolkit.getAnnotationsAfterPosition(docCAS, wrongTag
+                 .getAnnotation().getEnd(), maxDistance, TextRulerToolkit
+                 .getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
+
+         // TODO stop after the first found match or create one bad example for each found
+         // occurrence ??!! for now: stop after one ! so create only ONE bad example...
+         int leftDistance = 0;
+         TextRulerExample leftCorrectTag = null;
+         for (int i = left.size() - 1; i >= 0; i--) {
+           leftDistance++;
+           TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(left.get(i),
+                   doc, target, docCAS.getTypeSystem());
+           leftCorrectTag = TextRulerToolkit.exampleListContainsAnnotation(correctTags, needle);
+           if (leftCorrectTag != null)
+             break;
+         }
+
+         int rightDistance = 0;
+         TextRulerExample rightCorrectTag = null;
+         for (AnnotationFS fs : right) {
+           rightDistance++;
+           TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(fs, doc,
+                   target, docCAS.getTypeSystem());
+           rightCorrectTag = TextRulerToolkit.exampleListContainsAnnotation(correctTags, needle);
+           if (rightCorrectTag != null)
+             break;
+         }
+
+         TextRulerExample theCorrectTag;
+         if (leftCorrectTag == null || rightCorrectTag == null) {
+           // At most one side has a match. The distance counter of the side without a match
+           // is only the number of tokens scanned, so it must not take part in the
+           // comparison (previously a right-only match could be dropped because of it).
+           theCorrectTag = leftCorrectTag != null ? leftCorrectTag : rightCorrectTag;
+         } else if (rightDistance < leftDistance) {
+           theCorrectTag = rightCorrectTag;
+         } else if (rightDistance > leftDistance) {
+           theCorrectTag = leftCorrectTag;
+         } else if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY) {
+           // same distance: use the one that would lie in the slot filler
+           theCorrectTag = rightCorrectTag;
+         } else {
+           theCorrectTag = leftCorrectTag;
+         }
+
+         if (theCorrectTag != null) {
+           TextRulerToolkit.log("FOUND BAD EXAMPLE FOR SHIFTING !!");
+           TextRulerShiftExample shiftExample = new TextRulerShiftExample(doc,
+                   wrongTag.getAnnotation(), theCorrectTag.getAnnotation(), true, target);
+           newExamples.add(shiftExample);
+         }
+       }
+       // File.separator joins directory and file name; File.pathSeparator (":" or ";") is the
+       // separator between entries of a path LIST and produced a wrong file name here.
+       TextRulerToolkit.writeCAStoXMIFile(testCAS, dir + File.separator + doc.getCasFileName());
+     }
+     testCAS.reset();
+   } catch (Exception e) {
+     e.printStackTrace();
+     return false;
+   }
+
+   return true;
+ }
+
+ /**
+  * Extends the inherited file header with the declaration of the boolean variable
+  * {@code redoContextualRules} that the generated contextual rules block refers to.
+  */
+ @Override
+ public String getTMFileHeaderString() {
+   final String inheritedHeader = super.getTMFileHeaderString();
+   return inheritedHeader + "BOOLEAN redoContextualRules;\n\n";
+ }
+
+ @Override
+ protected boolean checkForMandatoryTypes() {
+ if (!super.checkForMandatoryTypes())
+ return false;
+
+ CAS someCas = getTestCAS();
+ TypeSystem ts = someCas.getTypeSystem();
+ // check if all helper types are present:
+ List<String> list = new ArrayList<String>();
+
+ // only the first slot is important for now...
+ list.add(new TextRulerTarget(slotNames[0], MLTargetType.SINGLE_LEFT_BOUNDARY, this)
+ .getSingleSlotTypeName());
+ list.add(new TextRulerTarget(slotNames[0], MLTargetType.SINGLE_RIGHT_BOUNDARY, this)
+ .getSingleSlotTypeName());
+
+ // TODO add correction types here!
+ for (String s : list) {
+ if (ts.getType(s) == null)
+ return false;
+ }
+ return true;
+ }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/BasicLP2.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/BasicLP2.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2CurrentBestRulesQueue.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2CurrentBestRulesQueue.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2CurrentBestRulesQueue.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2CurrentBestRulesQueue.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.lp2;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.PriorityQueue;
+
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+
+/**
+ * Bounded collection of the currently best LP2 rules. Rules are ranked by
+ * {@link #cmpRules(LP2Rule, LP2Rule)}; a second queue with the inverted ranking gives cheap
+ * access to the worst rule when the collection is cut back to its maximum size.
+ *
+ * Iteration, {@link #toArray()} and {@link #printDebug()} deliver the rules best-first. They
+ * work on a sorted snapshot because the iterator of a {@link PriorityQueue} does not traverse
+ * its elements in priority order.
+ */
+public class LP2CurrentBestRulesQueue implements Iterable<LP2Rule> {
+
+  /** All rules, best rule at the head. */
+  private final PriorityQueue<LP2Rule> ruleList;
+
+  /** The same rules with inverted ranking, worst rule at the head. */
+  private final PriorityQueue<LP2Rule> reverseRuleList;
+
+  /** Number of rules that survive {@link #cutToMaxSize()}; at least 1. */
+  private final int maxSize;
+
+  /**
+   * Ranks two rules; a negative result means {@code o1} is the better rule. Criteria in order:
+   * more covered positives, lower error rate, fewer constraints (more general rule), and
+   * finally the rule string as tie breaker.
+   */
+  private static int cmpRules(LP2Rule o1, LP2Rule o2) {
+    // 1st criterion: sort by decreasing number of positive matches:
+    if (o1.getCoveringStatistics().getCoveredPositivesCount() > o2.getCoveringStatistics()
+            .getCoveredPositivesCount())
+      return -1;
+    if (o1.getCoveringStatistics().getCoveredPositivesCount() < o2.getCoveringStatistics()
+            .getCoveredPositivesCount())
+      return 1;
+
+    // 2nd criterion: sort by increasing error rate:
+    if (o1.getErrorRate() < o2.getErrorRate())
+      return -1;
+    if (o1.getErrorRate() > o2.getErrorRate())
+      return 1;
+
+    // 3rd criterion:
+    // TODO: if one rule has more positive matches than a threshold then prefer the one with
+    // more generic conditions; else prefer the other one.
+    // test for now: prefer more general rules !
+    int c1 = o1.totalConstraintCount();
+    int c2 = o2.totalConstraintCount();
+    if (c1 < c2)
+      return -1;
+    if (c1 > c2)
+      return 1;
+
+    return o1.getRuleString().compareTo(o2.getRuleString());
+  }
+
+  /**
+   * @param maxSize
+   *          maximum number of rules kept by {@link #cutToMaxSize()}; values below 1 are
+   *          raised to 1
+   */
+  public LP2CurrentBestRulesQueue(int maxSize) {
+    this.maxSize = Math.max(1, maxSize);
+
+    this.ruleList = new PriorityQueue<LP2Rule>(this.maxSize, new Comparator<LP2Rule>() {
+      public int compare(LP2Rule o1, LP2Rule o2) {
+        return cmpRules(o1, o2);
+      }
+    });
+    this.reverseRuleList = new PriorityQueue<LP2Rule>(this.maxSize, new Comparator<LP2Rule>() {
+      public int compare(LP2Rule o1, LP2Rule o2) {
+        return -cmpRules(o1, o2);
+      }
+    });
+  }
+
+  /** Returns the current rules ordered best-first, without touching the queue itself. */
+  private ArrayList<LP2Rule> sortedSnapshot() {
+    // copying a PriorityQueue keeps its comparator; polling the copy yields priority order
+    PriorityQueue<LP2Rule> drain = new PriorityQueue<LP2Rule>(ruleList);
+    ArrayList<LP2Rule> sorted = new ArrayList<LP2Rule>(drain.size());
+    while (!drain.isEmpty()) {
+      sorted.add(drain.poll());
+    }
+    return sorted;
+  }
+
+  /**
+   * Iterates over a best-first snapshot of the rules. Removing through the iterator does not
+   * change the queue; use {@link #remove(LP2Rule)} for that.
+   */
+  public Iterator<LP2Rule> iterator() {
+    return sortedSnapshot().iterator();
+  }
+
+  /** Removes all rules. */
+  public void clear() {
+    ruleList.clear();
+    reverseRuleList.clear();
+  }
+
+  /** Adds every rule of the given collection. */
+  public void addAll(Collection<LP2Rule> rules) {
+    for (LP2Rule r : rules)
+      add(r);
+  }
+
+  /** Adds a rule; the size limit is only enforced by {@link #cutToMaxSize()}. */
+  public void add(LP2Rule rule) {
+    ruleList.add(rule);
+    reverseRuleList.add(rule);
+  }
+
+  /** @return whether an equal rule is currently contained */
+  public boolean contains(LP2Rule rule) {
+    return ruleList.contains(rule);
+  }
+
+  /**
+   * Removes the worst rules until at most {@code maxSize} rules are left.
+   *
+   * @return the removed rules, worst first
+   */
+  public Collection<LP2Rule> cutToMaxSize() {
+    ArrayList<LP2Rule> result = new ArrayList<LP2Rule>();
+    while (ruleList.size() > maxSize) {
+      LP2Rule tail = reverseRuleList.poll();
+      if (tail == null)
+        break; // queues out of sync; stop instead of looping forever
+      ruleList.remove(tail);
+      result.add(tail);
+    }
+    return result;
+  }
+
+  /** @return the best rule, or {@code null} if there is none */
+  public LP2Rule peek() {
+    return ruleList.peek();
+  }
+
+  /** Removes the given rule from both internal queues. */
+  public void remove(LP2Rule r) {
+    ruleList.remove(r);
+    reverseRuleList.remove(r);
+  }
+
+  /** @return the rules as a new array, ordered best-first */
+  public LP2Rule[] toArray() {
+    ArrayList<LP2Rule> sorted = sortedSnapshot();
+    return sorted.toArray(new LP2Rule[sorted.size()]);
+  }
+
+  /**
+   * Removes every rule whose covered positive examples are all covered by a better-ranked
+   * rule as well.
+   */
+  protected void removeSubsumedRules() {
+    ArrayList<LP2Rule> removeList = new ArrayList<LP2Rule>();
+    LP2Rule[] rulesArray = toArray(); // best-first, so rule1 always outranks rule2 below
+    for (int index1 = 0; index1 < rulesArray.length - 1; index1++) {
+      LP2Rule rule1 = rulesArray[index1];
+      for (int index2 = index1 + 1; index2 < rulesArray.length; index2++) {
+        LP2Rule rule2 = rulesArray[index2];
+        if (rule1.getCoveringStatistics().getCoveredPositiveExamples().containsAll(
+                rule2.getCoveringStatistics().getCoveredPositiveExamples()))
+          removeList.add(rule2);
+      }
+    }
+    for (LP2Rule r : removeList)
+      remove(r);
+  }
+
+  /** Logs all rules best-first together with their statistics. */
+  public void printDebug() {
+    TextRulerToolkit.log("-------CURRENT BEST RULES:");
+    for (LP2Rule r : sortedSnapshot()) {
+      TextRulerToolkit.log(r.getRuleString() + " " + r.getCoveringStatistics() + " error="
+              + r.getErrorRate() + " constraints=" + r.totalConstraintCount());
+    }
+  }
+
+  /** @return the number of rules currently held */
+  public int size() {
+    return ruleList.size();
+  }
+}
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2CurrentBestRulesQueue.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2CurrentBestRulesQueue.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2Rule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2Rule.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2Rule.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2Rule.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.lp2;
+
+import org.apache.uima.textmarker.textruler.core.TextRulerBasicLearner;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerSingleSlotRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerStatisticsCollector;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget.MLTargetType;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+
+ /**
+  * Single-slot rule used by the LP2 learners of this package. On top of
+  * {@link TextRulerSingleSlotRule} it stores an error rate (covered negatives divided by covered
+  * positives, refreshed whenever the covering statistics are set) and two boolean flags: the
+  * "pre-filler start rule" flag and the "contextual rule" flag.
+  */
+ public class LP2Rule extends TextRulerSingleSlotRule {
+
+   /** Covered negatives / covered positives; {@code Float.MAX_VALUE} if no positive is covered. */
+   protected float errorRate;
+
+   /** Backing field of {@link #isPreFillerStartRule()}. */
+   protected boolean setIsPreFillerStartRule = false;
+
+   /** Backing field of {@link #isContextualRule()}. */
+   protected boolean isContextualRule = false;
+
+   public LP2Rule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+     super(parentAlgorithm, target);
+   }
+
+   /**
+    * Copy constructor: delegates to the super class copy constructor and then takes over the error
+    * rate and both flags of {@code copyFrom}.
+    */
+   protected LP2Rule(LP2Rule copyFrom) {
+     super(copyFrom);
+     this.isContextualRule = copyFrom.isContextualRule;
+     this.setIsPreFillerStartRule = copyFrom.setIsPreFillerStartRule;
+     this.errorRate = copyFrom.errorRate;
+   }
+
+   @Override
+   public LP2Rule copy() {
+     return new LP2Rule(this);
+   }
+
+   /**
+    * Stores the statistics in the super class and recomputes {@link #getErrorRate()} from them.
+    * When the statistics report less than one covered positive example, an error is logged and the
+    * error rate is set to {@code Float.MAX_VALUE} so that the rule is rated as bad as possible.
+    */
+   @Override
+   public void setCoveringStatistics(TextRulerStatisticsCollector c) {
+     super.setCoveringStatistics(c);
+     final int positives = c.getCoveredPositivesCount();
+     final int negatives = c.getCoveredNegativesCount();
+     if (positives >= 1) {
+       errorRate = (float) negatives / (float) positives;
+       return;
+     }
+     TextRulerToolkit.log("ERROR, A RULE MAY NOT COVER ZERO POSITIVE EXAMPLES! WHAT'S WRONG ?");
+     TextRulerToolkit.log("\tRULE: " + getRuleString());
+     // make sure this rule is rated totally bad:
+     errorRate = Float.MAX_VALUE;
+   }
+
+   public float getErrorRate() {
+     return errorRate;
+   }
+
+   /**
+    * Counts all constraints of the pre- and post-filler pattern. Every item counts as one
+    * constraint by itself (a wildcard "ANY" item still demands that a token is present) and every
+    * constraint stored inside an item counts one more.
+    */
+   public int totalConstraintCount() {
+     return countConstraints(slotPattern.preFillerPattern, 1)
+             + countConstraints(slotPattern.postFillerPattern, 1);
+   }
+
+   /**
+    * Counts only the constraints stored inside the items of the pre- and post-filler pattern; the
+    * items themselves are not counted.
+    */
+   public int totalInnerConstraintCount() {
+     return countConstraints(slotPattern.preFillerPattern, 0)
+             + countConstraints(slotPattern.postFillerPattern, 0);
+   }
+
+   /**
+    * Sums {@code perItem + item.totalConstraintCount()} over all given items; every item is
+    * expected to be an {@link LP2RuleItem}.
+    */
+   private static int countConstraints(Iterable<? extends TextRulerRuleItem> items, int perItem) {
+     int sum = 0;
+     for (TextRulerRuleItem item : items) {
+       sum += perItem + ((LP2RuleItem) item).totalConstraintCount();
+     }
+     return sum;
+   }
+
+   public boolean isPreFillerStartRule() {
+     return setIsPreFillerStartRule;
+   }
+
+   public void setIsPreFillerStartRule(boolean flag) {
+     setIsPreFillerStartRule = flag;
+   }
+
+   /**
+    * Returns the item that carries the boundary mark: the first post-filler item for a left
+    * boundary target, otherwise the last pre-filler item.
+    */
+   public LP2RuleItem getMarkingRuleItem() {
+     final boolean marksLeftBoundary = target.type == MLTargetType.SINGLE_LEFT_BOUNDARY;
+     if (!marksLeftBoundary) {
+       final int lastIndex = slotPattern.preFillerPattern.size() - 1;
+       return (LP2RuleItem) slotPattern.preFillerPattern.get(lastIndex);
+     }
+     return (LP2RuleItem) slotPattern.postFillerPattern.get(0);
+   }
+
+   public boolean isContextualRule() {
+     return isContextualRule;
+   }
+
+   /**
+    * Updates the contextual flag. The rule is only flagged for recompilation when the value
+    * actually changes.
+    */
+   public void setIsContextualRule(boolean flag) {
+     if (flag == isContextualRule) {
+       return;
+     }
+     isContextualRule = flag;
+     setNeedsCompile(true);
+   }
+
+ }
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2Rule.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2Rule.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2RuleItem.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2RuleItem.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2RuleItem.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2RuleItem.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,302 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.lp2;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.cas.Type;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerSingleSlotRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget.MLTargetType;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+import org.apache.uima.textmarker.textruler.core.TextRulerWordConstraint;
+
+ /**
+  * One token position of an {@link LP2Rule} pattern. An item may carry a word constraint, a
+  * context ("NEAR") constraint and any number of further annotation type constraints.
+  * {@link #getStringForRuleString} renders the item as a rule element string and, for the marking
+  * item of a rule, appends the MARKONCE action.
+  */
+ public class LP2RuleItem implements TextRulerRuleItem {
+
+   protected TextRulerWordConstraint wordConstraint;
+
+   protected MLLP2ContextConstraint contextConstraint;
+
+   protected List<MLLP2OtherConstraint> otherConstraints = new ArrayList<MLLP2OtherConstraint>();
+
+   /**
+    * Context constraint that is rendered as a {@code NEAR(...)} condition on the counterpart
+    * boundary of the parent rule's target. Two instances are equal when their rendered strings
+    * are equal.
+    */
+   public static class MLLP2ContextConstraint {
+     private String contextBoundaryName;
+
+     private int contextSize;
+
+     private boolean direction; // false = left; true = right;
+
+     public MLLP2ContextConstraint(int contextSize, LP2Rule parentRule) {
+       this.contextSize = contextSize;
+       contextBoundaryName = TextRulerToolkit.getTypeShortName(parentRule.getTarget()
+               .getCounterPartBoundaryTarget().getSingleSlotTypeName());
+       direction = parentRule.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY;
+     }
+
+     public MLLP2ContextConstraint(MLLP2ContextConstraint copyFrom) {
+       contextBoundaryName = copyFrom.contextBoundaryName;
+       contextSize = copyFrom.contextSize;
+       direction = copyFrom.direction;
+     }
+
+     public MLLP2ContextConstraint copy() {
+       return new MLLP2ContextConstraint(this);
+     }
+
+     @Override
+     public String toString() {
+       return "NEAR(" + contextBoundaryName + ", 0," + contextSize + "," + direction + ",true)";
+     }
+
+     /**
+      * Compares by rendered string. {@code null} and objects of other types are simply unequal
+      * (the unchecked cast used before threw for them).
+      */
+     @Override
+     public boolean equals(Object o) {
+       if (this == o) {
+         return true;
+       }
+       if (!(o instanceof MLLP2ContextConstraint)) {
+         return false;
+       }
+       return toString().equals(o.toString());
+     }
+
+     @Override
+     public int hashCode() {
+       return toString().hashCode();
+     }
+
+   }
+
+   /**
+    * Constraint on the annotation type of {@code constraintAnnotation}, recorded together with the
+    * token annotation it was derived from. Its string form is the short name of that type.
+    */
+   public static class MLLP2OtherConstraint {
+
+     TextRulerAnnotation tokenAnnotation;
+
+     TextRulerAnnotation constraintAnnotation;
+
+     boolean canBeAnchor;
+
+     Type type;
+
+     public MLLP2OtherConstraint(TextRulerAnnotation tokenAnnotation,
+             TextRulerAnnotation constraintAnnotation) {
+       this.tokenAnnotation = tokenAnnotation;
+       this.constraintAnnotation = constraintAnnotation;
+       this.type = constraintAnnotation.getType();
+       // the constraint can serve as anchor only if it spans exactly the token
+       canBeAnchor = (tokenAnnotation.getBegin() == constraintAnnotation.getBegin())
+               && (tokenAnnotation.getEnd() == constraintAnnotation.getEnd());
+       // TODO is the matching END also a requirement ?
+     }
+
+     /** @return true if token and constraint annotation are the very same object */
+     public boolean isTMBasicTypeTokenConstraint() {
+       return tokenAnnotation == constraintAnnotation;
+     }
+
+     public boolean canBeAnchorConstraint() {
+       return canBeAnchor;
+     }
+
+     /**
+      * Equal when the type short names and the anchor capability match. {@code null} and objects
+      * of other types are simply unequal (the unchecked cast used before threw for them).
+      */
+     @Override
+     public boolean equals(Object o) {
+       if (this == o) {
+         return true;
+       }
+       if (!(o instanceof MLLP2OtherConstraint)) {
+         return false;
+       }
+       MLLP2OtherConstraint co = (MLLP2OtherConstraint) o;
+       return toString().equals(co.toString()) && (canBeAnchor == co.canBeAnchor);
+     }
+
+     @Override
+     public int hashCode() {
+       return toString().hashCode() * (canBeAnchor ? 2 : 1);
+     }
+
+     @Override
+     public String toString() {
+       return type.getShortName();
+     }
+
+     public MLLP2OtherConstraint copy() {
+       return new MLLP2OtherConstraint(tokenAnnotation, constraintAnnotation);
+     }
+
+   }
+
+   /** Copy constructor: copies the word, context and all other constraints of {@code copyFrom}. */
+   public LP2RuleItem(LP2RuleItem copyFrom) {
+     if (copyFrom.wordConstraint != null) {
+       wordConstraint = copyFrom.wordConstraint.copy();
+     }
+     if (copyFrom.contextConstraint != null) {
+       contextConstraint = copyFrom.contextConstraint.copy();
+     }
+     for (MLLP2OtherConstraint c : copyFrom.otherConstraints) {
+       otherConstraints.add(c.copy());
+     }
+   }
+
+   public LP2RuleItem() {
+     super();
+   }
+
+   public LP2RuleItem copy() {
+     return new LP2RuleItem(this);
+   }
+
+   @Override
+   public int hashCode() {
+     return toString().hashCode();
+   }
+
+   /**
+    * Compares two items by their rule-independent string form. Note that this overloads rather
+    * than overrides {@code Object.equals(Object)} (presumably because TextRulerRuleItem declares
+    * this signature - verify against the interface). {@code null} and other item implementations
+    * compare unequal instead of raising an exception.
+    */
+   public boolean equals(TextRulerRuleItem o) {
+     if (!(o instanceof LP2RuleItem)) {
+       return false;
+     }
+     return toString().equals(o.toString());
+   }
+
+   @Override
+   public String toString() {
+     return getStringForRuleString(null, null, 0, 0, 0, 0, 0);
+   }
+
+   /** @return the first other-constraint that is a basic token type constraint, or null */
+   public MLLP2OtherConstraint getTMBasicTypeTokenConstraint() {
+     for (MLLP2OtherConstraint c : otherConstraints) {
+       if (c.isTMBasicTypeTokenConstraint()) {
+         return c;
+       }
+     }
+     return null;
+   }
+
+   /**
+    * Renders this item as {@code anchor{conditions -> actions}}.
+    * <p>
+    * The anchor is the word constraint (or its type short name plus a REGEXP condition), or -
+    * without a word constraint - the preferred anchor among the other constraints, or "ANY". All
+    * remaining constraints become conditions. If this item is the marking item of {@code rule}, a
+    * MARKONCE action is appended, and for contextual rules additionally a "-IS" condition and an
+    * ASSIGN action.
+    *
+    * @param rule
+    *          the rule this item is rendered for; may be null (then no marking action is written),
+    *          otherwise it is cast to {@link LP2Rule}
+    * @param type
+    *          the pattern (pre-filler / post-filler) this item belongs to
+    * @param numberInPattern
+    *          index of this item within its pattern
+    * @param patternSize
+    *          number of items in that pattern
+    * @param numberInRule
+    *          not used by this implementation
+    * @param ruleSize
+    *          not used by this implementation
+    * @param slotIndex
+    *          not used by this implementation
+    */
+   public String getStringForRuleString(TextRulerRule rule, MLRuleItemType type,
+           int numberInPattern, int patternSize, int numberInRule, int ruleSize, int slotIndex) {
+
+     LP2Rule lp2Rule = (LP2Rule) rule;
+     boolean isMarkingItem = isMarkingItem(rule, type, numberInPattern, patternSize);
+     boolean isContextualMarkingItem = isMarkingItem && lp2Rule.isContextualRule();
+
+     List<String> constraints = new ArrayList<String>();
+
+     String anchor;
+     if (wordConstraint == null) {
+       anchor = "ANY";
+     } else if (wordConstraint.isRegExpConstraint()) {
+       anchor = wordConstraint.typeShortName();
+       constraints.add("REGEXP(\"" + wordConstraint + "\")");
+     } else {
+       anchor = wordConstraint.toString();
+     }
+
+     if (isContextualMarkingItem) {
+       constraints.add("-IS(" + ((TextRulerSingleSlotRule) rule).getMarkName() + ")");
+     }
+
+     if (contextConstraint != null) {
+       constraints.add(contextConstraint.toString());
+     }
+
+     if (wordConstraint == null) {
+       MLLP2OtherConstraint anchorConstraint = findAnchorConstraint();
+       for (MLLP2OtherConstraint oc : otherConstraints) {
+         if (oc != anchorConstraint) {
+           constraints.add((oc.canBeAnchorConstraint() ? "IS(" : "PARTOF(") + oc + ")");
+         }
+       }
+       if (anchorConstraint != null) {
+         anchor = anchorConstraint.toString();
+       }
+     }
+
+     StringBuilder result = new StringBuilder();
+     boolean hasConstraints = !constraints.isEmpty();
+     if (hasConstraints) {
+       StringBuilder joined = new StringBuilder();
+       for (String constraintStr : constraints) {
+         if (joined.length() > 0) {
+           joined.append(", ");
+         }
+         joined.append(constraintStr);
+       }
+       result.append("{").append(joined);
+     }
+
+     if (isMarkingItem) {
+       if (!hasConstraints) {
+         result.append("{");
+       }
+       result.append("->MARKONCE(" + ((TextRulerSingleSlotRule) rule).getMarkName() + ")");
+       if (isContextualMarkingItem) {
+         result.append(", ASSIGN(redoContextualRules, true)");
+       }
+       result.append("}");
+     } else if (hasConstraints) {
+       result.append("}");
+     }
+     return anchor + result;
+   }
+
+   /**
+    * Tells whether the item at the given position is the one that marks the boundary: the last
+    * pre-filler item of a right boundary rule or the first post-filler item of a left boundary
+    * rule. Always false without a rule.
+    */
+   private static boolean isMarkingItem(TextRulerRule rule, MLRuleItemType type,
+           int numberInPattern, int patternSize) {
+     return (rule != null)
+             && (((rule.getTarget().type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
+                     && (type == MLRuleItemType.PREFILLER) && (numberInPattern == patternSize - 1))
+             || ((rule.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY)
+                     && (type == MLRuleItemType.POSTFILLER) && (numberInPattern == 0)));
+   }
+
+   /**
+    * Picks the other-constraint to be used as anchor: preferably the basic token type constraint,
+    * otherwise the first constraint that can be an anchor, otherwise null.
+    */
+   private MLLP2OtherConstraint findAnchorConstraint() {
+     MLLP2OtherConstraint basic = getTMBasicTypeTokenConstraint();
+     if (basic != null) {
+       return basic;
+     }
+     for (MLLP2OtherConstraint c : otherConstraints) {
+       if (c.canBeAnchorConstraint()) {
+         return c;
+       }
+     }
+     return null;
+   }
+
+   /** Adds the constraint unless an equal one is already present. */
+   public void addOtherConstraint(MLLP2OtherConstraint c) {
+     if (!otherConstraints.contains(c)) {
+       otherConstraints.add(c);
+     }
+   }
+
+   public List<MLLP2OtherConstraint> getOtherConstraints() {
+     return otherConstraints;
+   }
+
+   public void setWordConstraint(TextRulerAnnotation tokenAnnotation) {
+     setWordConstraint(new TextRulerWordConstraint(tokenAnnotation));
+   }
+
+   public void setContextConstraint(MLLP2ContextConstraint c) {
+     contextConstraint = c;
+   }
+
+   public MLLP2ContextConstraint getContextConstraint() {
+     return contextConstraint;
+   }
+
+   public void setWordConstraint(TextRulerWordConstraint c) {
+     wordConstraint = c;
+   }
+
+   public TextRulerWordConstraint getWordConstraint() {
+     return wordConstraint;
+   }
+
+   /**
+    * Removes every other-constraint whose string form (the short name of its type) equals
+    * {@code name}. The previous implementation handed the String itself to
+    * {@code List.remove(Object)}; a String never equals a constraint element, so nothing was ever
+    * removed.
+    */
+   public void removeConstraintWithName(String name) {
+     // iterate backwards so that removing by index does not shift unvisited elements
+     for (int i = otherConstraints.size() - 1; i >= 0; i--) {
+       if (otherConstraints.get(i).toString().equals(name)) {
+         otherConstraints.remove(i);
+       }
+     }
+   }
+
+   /** @return number of other constraints plus one each for a present word / context constraint */
+   public int totalConstraintCount() {
+     return otherConstraints.size() + (wordConstraint != null ? 1 : 0)
+             + (contextConstraint != null ? 1 : 0);
+   }
+
+ }
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2RuleItem.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/LP2RuleItem.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.lp2;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.textmarker.textruler.core.TextRulerAnnotation;
+import org.apache.uima.textmarker.textruler.core.TextRulerExample;
+import org.apache.uima.textmarker.textruler.core.TextRulerRule;
+import org.apache.uima.textmarker.textruler.core.TextRulerRuleItem;
+import org.apache.uima.textmarker.textruler.core.TextRulerRulePattern;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget;
+import org.apache.uima.textmarker.textruler.core.TextRulerTarget.MLTargetType;
+import org.apache.uima.textmarker.textruler.core.TextRulerToolkit;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.textmarker.textruler.learner.lp2.LP2RuleItem.MLLP2ContextConstraint;
+import org.apache.uima.textmarker.textruler.learner.lp2.LP2RuleItem.MLLP2OtherConstraint;
+import org.apache.uima.util.FileUtils;
+
+ /**
+  * Naive variant of the LP2 learner: for every positive example it builds an initial rule from
+  * the token window around the boundary, enumerates all generalizations of that rule, tests them
+  * on the example documents and keeps the ones that satisfy the coverage and error thresholds as
+  * best rules or - after adding a context constraint - as contextual rules.
+  */
+ public class NaiveLP2 extends BasicLP2 {
+
+   /** If true (and TextRulerToolkit.DEBUG is set), debug files are written to the temp folder. */
+   public static final boolean SAVE_DEBUG_INFO_IN_TEMPFOLDER = false;
+
+   public NaiveLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
+           Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+     super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, delegate);
+   }
+
+   /**
+    * Creates the initial rule for {@code e}, generalizes it, tests all generalizations in one
+    * batch on the example documents and hands each of them to {@link #checkAndHandleNewRule}.
+    * Returns right after the generalization step if the learner should abort.
+    */
+   @Override
+   protected void induceRulesFromExample(TextRulerExample e, int roundNumber) {
+     LP2Rule baseRule = createInitialRuleForPositiveExample(e);
+     List<LP2Rule> genRules = generalizeRule(baseRule);
+
+     if (shouldAbort()) {
+       return;
+     }
+
+     // cache and testCAS optimized rule testing: all generalizations are tested in one batch
+     sendStatusUpdateToDelegate("Round " + roundNumber + " - Testing " + genRules.size()
+             + " generalizations... - uncovered examples: "
+             + (examples.size() - coveredExamples.size() + " / " + examples.size()),
+             TextRulerLearnerState.ML_RUNNING, false);
+     testRulesOnDocumentSet(new ArrayList<TextRulerRule>(genRules), exampleDocuments);
+
+     List<LP2Rule> testedRules = new ArrayList<LP2Rule>();
+     for (LP2Rule newRule : genRules) {
+       checkAndHandleNewRule(newRule);
+       if (TextRulerToolkit.DEBUG) {
+         testedRules.add(newRule);
+       }
+     }
+
+     if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
+       saveGeneralizationsDebugFile(testedRules, e, roundNumber);
+     }
+   }
+
+   /**
+    * Debug helper: sorts the rules by rule string and writes one line per rule (covering
+    * statistics and rule string) to a "generalizations" file in the temp directory.
+    */
+   private void saveGeneralizationsDebugFile(List<LP2Rule> rules, TextRulerExample e,
+           int roundNumber) {
+     Collections.sort(rules, new Comparator<LP2Rule>() {
+
+       public int compare(LP2Rule o1, LP2Rule o2) {
+         return o1.getRuleString().compareTo(o2.getRuleString());
+       }
+
+     });
+
+     String startend = e.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY ? "left_"
+             : "right_";
+     File file = new File(tempDirectory() + startend + "generalizations" + roundNumber + ".tm");
+     StringBuilder str = new StringBuilder();
+     for (TextRulerRule rule : rules) {
+       str.append(rule.getCoveringStatistics()).append("\t\t").append(rule.getRuleString())
+               .append("\n");
+     }
+     try {
+       FileUtils.saveString2File(str.toString(), file);
+     } catch (Exception ex) {
+       // best effort only: a failed debug dump must not stop the learning process
+       ex.printStackTrace();
+     }
+   }
+
+   /**
+    * Rates an already tested rule. A rule that covers enough positives with an acceptable error
+    * rate is added to the best rules. A rule that covers enough positives but makes too many
+    * errors is copied, extended by a context constraint on its marking item, retested and - if it
+    * then satisfies both thresholds - added to the contextual rules. Rules with too few covered
+    * positives are dropped.
+    */
+   protected void checkAndHandleNewRule(LP2Rule rule) {
+     boolean tooFewPositives = rule.getCoveringStatistics().getCoveredPositivesCount() < minCoveredPositives;
+     boolean tooManyErrors = rule.getErrorRate() > maxErrorThreshold;
+
+     if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
+       TextRulerToolkit.appendStringToFile(tempDirectory() + "bestcandidates.tm", rule
+               .getRuleString()
+               + "\n");
+     }
+
+     if (!tooFewPositives && !tooManyErrors) {
+       currentBestRules.add(rule);
+       currentBestRules.removeSubsumedRules();
+       currentBestRules.cutToMaxSize();
+       return;
+     }
+     if (tooFewPositives) {
+       return;
+     }
+
+     // test in context
+     // in our TM representation, we simply can add a NEAR condition in
+     // the MARKing rule item and retest it on the
+     // corpus. we should do that for all kinds of tags we have, but
+     // currently we only do it for the corresponding opening/closing
+     // tag, since we do not have any information about other slots yet!
+     // TODO use all other slot tags! (see optimized version as well)
+     LP2Rule contextualRule = rule.copy();
+     LP2RuleItem item = contextualRule.getMarkingRuleItem();
+     item.setContextConstraint(new MLLP2ContextConstraint(slotMaximumTokenCount, contextualRule));
+     contextualRule.setIsContextualRule(true);
+     // the item changed as well, so force a recompile even if the flag was already set
+     contextualRule.setNeedsCompile(true);
+
+     if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
+       // one rule per line, like bestcandidates.tm above
+       TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxcandidates.tm", contextualRule
+               .getRuleString()
+               + "\n");
+     }
+
+     testRuleOnDocumentSet(contextualRule, exampleDocuments); // not very fast... but works!
+     boolean ctxTooFewPositives = contextualRule.getCoveringStatistics()
+             .getCoveredPositivesCount() < minCoveredPositives;
+     boolean ctxTooManyErrors = contextualRule.getErrorRate() > maxErrorThreshold;
+     if (!ctxTooFewPositives && !ctxTooManyErrors) {
+       currentContextualRules.add(contextualRule);
+       currentContextualRules.removeSubsumedRules();
+       currentContextualRules.cutToMaxSize();
+     }
+   }
+
+   /**
+    * Enumerates all generalizations of {@code baseRule} (every combination of the per-item
+    * generalizations) and strips constraint-free items from the outer ends of each result.
+    */
+   protected List<LP2Rule> generalizeRule(LP2Rule baseRule) {
+     List<LP2Rule> result = new ArrayList<LP2Rule>();
+     TextRulerRulePattern rulePattern = new TextRulerRulePattern();
+     TextRulerRulePattern prePattern = baseRule.getPreFillerPattern();
+
+     // we have to reverse the order again!
+     for (int i = prePattern.size() - 1; i >= 0; i--) {
+       rulePattern.add(prePattern.get(i));
+     }
+     rulePattern.addAll(baseRule.getPostFillerPattern());
+
+     recursiveGeneralizeRule(baseRule, rulePattern, new TextRulerRulePattern(), result);
+     TextRulerToolkit.log("GENERALIZATIONS: " + result.size());
+
+     for (LP2Rule r : result) {
+       removeOutermostWildCardItemsFromRule(r);
+     }
+     return result;
+   }
+
+   /**
+    * Builds the most specific rule for a positive example: one item with a word constraint for
+    * each token of the window before and after the boundary position (begin of the example
+    * annotation for a left boundary target, its end otherwise).
+    */
+   protected LP2Rule createInitialRuleForPositiveExample(TextRulerExample example) {
+     TextRulerTarget target = example.getTarget();
+     LP2Rule rule = new LP2Rule(this, target);
+     CAS docCas = example.getDocumentCAS();
+     TextRulerAnnotation exampleAnnotation = example.getAnnotation();
+     TypeSystem ts = docCas.getTypeSystem();
+     Type tokensRootType = ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+     int thePosition = target.type == MLTargetType.SINGLE_LEFT_BOUNDARY ? exampleAnnotation
+             .getBegin() : exampleAnnotation.getEnd();
+
+     List<AnnotationFS> leftContext = TextRulerToolkit.getAnnotationsBeforePosition(docCas,
+             thePosition, windowSize, TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
+                     filterSet), tokensRootType);
+     List<AnnotationFS> rightContext = TextRulerToolkit.getAnnotationsAfterPosition(docCas,
+             thePosition, windowSize, TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
+                     filterSet), tokensRootType);
+
+     // the left context has to be reversed since we get the arrayList from
+     // the slot's point of view!
+     for (int i = leftContext.size() - 1; i >= 0; i--) {
+       rule.addPreFillerItem(createItemForToken(leftContext.get(i), example));
+     }
+     for (AnnotationFS afs : rightContext) {
+       rule.addPostFillerItem(createItemForToken(afs, example));
+     }
+     TextRulerToolkit.log("INITIAL RULE: " + rule.getRuleString());
+     return rule;
+   }
+
+   /**
+    * Creates the rule item for one context token: a word constraint on the token and, if that
+    * word constraint is a regular expression constraint, an additional type constraint on it.
+    */
+   private LP2RuleItem createItemForToken(AnnotationFS token, TextRulerExample example) {
+     TextRulerAnnotation annot = new TextRulerAnnotation(token, example.getDocument());
+     LP2RuleItem item = new LP2RuleItem();
+     item.setWordConstraint(annot);
+     if (item.getWordConstraint().isRegExpConstraint()) {
+       item.addOtherConstraint(new MLLP2OtherConstraint(annot, annot));
+     }
+     return item;
+   }
+
+   /**
+    * Depth-first enumeration over the item positions of {@code allItems}: for the next position
+    * each generalization of the base item is tried in turn. Once {@code currentPattern} is
+    * complete, a new rule is built from it (the first items, as many as the base rule has
+    * pre-filler items, become pre-filler items, the rest post-filler items) and added to
+    * {@code resultList} unless it has no inner constraint at all.
+    */
+   protected void recursiveGeneralizeRule(LP2Rule baseRule, TextRulerRulePattern allItems,
+           TextRulerRulePattern currentPattern, List<LP2Rule> resultList) {
+     if (currentPattern.size() == allItems.size()) {
+       LP2Rule newRule = new LP2Rule(this, baseRule.getTarget());
+       int preCount = baseRule.getPreFillerPattern().size();
+       for (int i = 0; i < currentPattern.size(); i++) {
+         if (i < preCount) {
+           newRule.addPreFillerItem(currentPattern.get(i));
+         } else {
+           newRule.addPostFillerItem(currentPattern.get(i));
+         }
+       }
+       // skip the ANY ANY ANY ANY... rule ! this makes no sense in no application!!
+       if (newRule.totalInnerConstraintCount() > 0) {
+         resultList.add(newRule);
+       }
+       return;
+     }
+
+     int index = currentPattern.size();
+     TextRulerRuleItem baseItem = allItems.get(index);
+     List<TextRulerRuleItem> itemGeneralizations = generalizeRuleItem((LP2RuleItem) baseItem);
+     for (TextRulerRuleItem newItem : itemGeneralizations) {
+       currentPattern.add(newItem);
+       recursiveGeneralizeRule(baseRule, allItems, currentPattern, resultList);
+       currentPattern.remove(currentPattern.size() - 1);
+     }
+   }
+
+   /**
+    * Enumerates every subset of {@code otherConstraints}: each constraint from
+    * {@code currentConstraintIndex} on is first left out and then included. For every complete
+    * subset a new item holding copies of the chosen constraints is appended to {@code result}.
+    */
+   protected void recursiveGeneralizeRuleItem(LP2RuleItem baseItem,
+           List<MLLP2OtherConstraint> otherConstraints, int currentConstraintIndex,
+           List<MLLP2OtherConstraint> currentConstraintTuple, List<TextRulerRuleItem> result) {
+     if (currentConstraintIndex >= otherConstraints.size()) {
+       LP2RuleItem newItem = new LP2RuleItem();
+       for (MLLP2OtherConstraint c : currentConstraintTuple) {
+         newItem.addOtherConstraint(c.copy());
+       }
+       result.add(newItem);
+       return;
+     }
+
+     MLLP2OtherConstraint currentConstraint = otherConstraints.get(currentConstraintIndex);
+     // recurse WITHOUT and WITH this constraint:
+     recursiveGeneralizeRuleItem(baseItem, otherConstraints, currentConstraintIndex + 1,
+             currentConstraintTuple, result);
+     currentConstraintTuple.add(currentConstraint);
+     recursiveGeneralizeRuleItem(baseItem, otherConstraints, currentConstraintIndex + 1,
+             currentConstraintTuple, result);
+     currentConstraintTuple.remove(currentConstraintTuple.size() - 1);
+   }
+
+   /**
+    * Returns all generalizations of one item: first (if present) an item with only a copy of the
+    * word constraint, then one item without word constraint for every subset of the other
+    * constraints, including the empty subset.
+    */
+   protected List<TextRulerRuleItem> generalizeRuleItem(LP2RuleItem baseItem) {
+     List<TextRulerRuleItem> result = new ArrayList<TextRulerRuleItem>();
+
+     // one with word constraint
+     if (baseItem.getWordConstraint() != null) {
+       LP2RuleItem newItem = new LP2RuleItem();
+       newItem.setWordConstraint(baseItem.getWordConstraint().copy());
+       result.add(newItem);
+     }
+
+     // all other combinations without word constraint
+     List<MLLP2OtherConstraint> constraints = baseItem.getOtherConstraints();
+     recursiveGeneralizeRuleItem(baseItem, constraints, 0, new ArrayList<MLLP2OtherConstraint>(),
+             result);
+     return result;
+   }
+
+   /**
+    * Removes constraint-free items from the outer end of the pre-filler and of the post-filler
+    * pattern. The last remaining pre-filler item of a right boundary rule and the last remaining
+    * post-filler item of a left boundary rule are never removed, since they are used for marking
+    * the boundary.
+    */
+   protected void removeOutermostWildCardItemsFromRule(LP2Rule rule) {
+     while (true) {
+       LP2RuleItem item = (LP2RuleItem) rule.getOutermostPreFillerItem();
+       if (item == null) {
+         // no more items left
+         break;
+       }
+       // keep the item that marks the SLOT END BOUNDARY (= RIGHT BOUNDARY)
+       if ((rule.getTarget().type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
+               && (rule.getPreFillerPattern().size() == 1)) {
+         break;
+       }
+       if (item.totalConstraintCount() != 0) {
+         break;
+       }
+       rule.removeOutermostPreFillerItem();
+     }
+     while (true) {
+       LP2RuleItem item = (LP2RuleItem) rule.getOutermostPostFillerItem();
+       if (item == null) {
+         // no more items left
+         break;
+       }
+       // keep the item that marks the SLOT START BOUNDARY (= LEFT BOUNDARY)
+       if ((rule.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY)
+               && (rule.getPostFillerPattern().size() == 1)) {
+         break;
+       }
+       if (item.totalConstraintCount() != 0) {
+         break;
+       }
+       rule.removeOutermostPostFillerItem();
+     }
+   }
+
+   @Override
+   public boolean collectNegativeCoveredInstancesWhenTesting() {
+     return false;
+   }
+
+ }
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2Factory.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2Factory.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2Factory.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2Factory.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+*/
+
+package org.apache.uima.textmarker.textruler.learner.lp2;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearner;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter.MLAlgorithmParamType;
+
+ /**
+  * {@link TextRulerLearnerFactory} implementation for the {@link NaiveLP2} learner. It creates
+  * learner instances and publishes the five configurable parameters together with their standard
+  * values, all of which are taken from constants declared on {@link BasicLP2}.
+  */
+ public class NaiveLP2Factory implements TextRulerLearnerFactory {
+
+   /**
+    * Creates a new {@link NaiveLP2} learner.
+    *
+    * @param inputFolderPath passed through to the learner
+    * @param additionalFolderPath accepted for the factory signature; not passed to the learner
+    * @param prePropTMFile passed through to the learner
+    * @param tempFolderPath passed through to the learner
+    * @param fullSlotTypeNames passed through to the learner
+    * @param filterSet passed through to the learner
+    * @param delegate passed through to the learner
+    * @return the freshly constructed learner
+    */
+   public TextRulerLearner createAlgorithm(String inputFolderPath, String additionalFolderPath,
+       String prePropTMFile, String tempFolderPath, String[] fullSlotTypeNames,
+       Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+     TextRulerLearner learner = new NaiveLP2(inputFolderPath, prePropTMFile, tempFolderPath,
+         fullSlotTypeNames, filterSet, delegate);
+     return learner;
+   }
+
+   /**
+    * Describes the parameters of this learner, in display order: window size, best rules list
+    * size, minimum covered positives, maximum error threshold (the only float parameter) and
+    * contextual rules list size.
+    *
+    * @return a new array holding the five parameter descriptors
+    */
+   public TextRulerLearnerParameter[] getAlgorithmParameters() {
+     return new TextRulerLearnerParameter[] {
+         new TextRulerLearnerParameter(BasicLP2.WINDOW_SIZE_KEY,
+             "Context Window Size (to the left and right)", MLAlgorithmParamType.ML_INT_PARAM),
+         new TextRulerLearnerParameter(BasicLP2.CURRENT_BEST_RULES_SIZE_KEY,
+             "Best Rules List Size", MLAlgorithmParamType.ML_INT_PARAM),
+         new TextRulerLearnerParameter(BasicLP2.MIN_COVERED_POSITIVES_PER_RULE_KEY,
+             "Minimum Covered Positives per Rule", MLAlgorithmParamType.ML_INT_PARAM),
+         new TextRulerLearnerParameter(BasicLP2.MAX_ERROR_THRESHOLD_KEY,
+             "Maximum Error Threshold", MLAlgorithmParamType.ML_FLOAT_PARAM),
+         new TextRulerLearnerParameter(BasicLP2.CURRENT_CONTEXTUAL_RULES_SIZE_KEY,
+             "Contextual Rules List Size", MLAlgorithmParamType.ML_INT_PARAM) };
+   }
+
+   /**
+    * Supplies the standard value for every parameter key returned by
+    * {@link #getAlgorithmParameters()}.
+    *
+    * @return a new mutable map from parameter key to its standard value
+    */
+   public Map<String, Object> getAlgorithmParameterStandardValues() {
+     Map<String, Object> defaults = new HashMap<String, Object>();
+     defaults.put(BasicLP2.WINDOW_SIZE_KEY, BasicLP2.STANDARD_WINDOW_SIZE);
+     defaults.put(BasicLP2.CURRENT_BEST_RULES_SIZE_KEY,
+         BasicLP2.STANDARD_MAX_CURRENT_BEST_RULES_COUNT);
+     defaults.put(BasicLP2.MIN_COVERED_POSITIVES_PER_RULE_KEY,
+         BasicLP2.STANDARD_MIN_COVERED_POSITIVES_PER_RULE);
+     defaults.put(BasicLP2.MAX_ERROR_THRESHOLD_KEY, BasicLP2.STANDARD_MAX_ERROR_THRESHOLD);
+     defaults.put(BasicLP2.CURRENT_CONTEXTUAL_RULES_SIZE_KEY,
+         BasicLP2.STANDARD_MAX_CONTEXTUAL_RULES_COUNT);
+     return defaults;
+   }
+
+ }
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2Factory.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2Factory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2PreferencePage.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2PreferencePage.java?rev=1157037&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2PreferencePage.java (added)
+++ uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2PreferencePage.java Fri Aug 12 10:32:50 2011
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.textruler.learner.lp2;
+
+import java.util.ArrayList;
+import java.util.Map;
+
+import org.apache.uima.textmarker.textruler.TextRulerPlugin;
+import org.apache.uima.textmarker.textruler.extension.TextRulerController;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerController;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerFactory;
+import org.apache.uima.textmarker.textruler.extension.TextRulerLearnerParameter;
+import org.eclipse.jface.preference.BooleanFieldEditor;
+import org.eclipse.jface.preference.FieldEditor;
+import org.eclipse.jface.preference.IPreferenceStore;
+import org.eclipse.jface.preference.PreferencePage;
+import org.eclipse.jface.preference.StringFieldEditor;
+import org.eclipse.swt.SWT;
+import org.eclipse.swt.layout.GridData;
+import org.eclipse.swt.layout.GridLayout;
+import org.eclipse.swt.widgets.Composite;
+import org.eclipse.swt.widgets.Control;
+import org.eclipse.ui.IWorkbench;
+import org.eclipse.ui.IWorkbenchPreferencePage;
+
+ /**
+  * Workbench preference page that exposes the parameters of the naive LP2 learner. One field
+  * editor is created per parameter reported by the learner's factory; the factory's standard
+  * values are installed as preference-store defaults.
+  */
+ public class NaiveLP2PreferencePage extends PreferencePage implements IWorkbenchPreferencePage {
+
+   // NOTE(review): the value looks like a shared category id rather than a page-specific one -
+   // confirm against the plugin.xml contribution.
+   public static String ID = "org.apache.uima.textmarker.textruler.algorithmPages";
+
+   /** ID under which the naive LP2 learner controller is looked up. */
+   private static final String LEARNER_ID = "org.apache.uima.textmarker.textruler.lp2naive";
+
+   /** Controller of the learner being configured; may be null if the lookup found nothing. */
+   private final TextRulerLearnerController algorithmController;
+
+   /** Preference store of the TextRuler plugin, shared by all editors on this page. */
+   private final IPreferenceStore store;
+
+   /** Editors created by {@link #createContents(Composite)}, in creation order. */
+   private final ArrayList<FieldEditor> fields = new ArrayList<FieldEditor>();
+
+   /**
+    * Looks up the learner controller and attaches the TextRuler plugin's preference store to
+    * this page.
+    */
+   public NaiveLP2PreferencePage() {
+     algorithmController = TextRulerController.getControllerForID(LEARNER_ID);
+     store = TextRulerPlugin.getDefault().getPreferenceStore();
+     setPreferenceStore(store);
+   }
+
+   /**
+    * Builds the page body: a single-column composite holding one editor per learner parameter.
+    * Boolean parameters get a check box; float, int and string parameters share a plain text
+    * editor; select parameters get no editor. Preference keys have the form
+    * {@code <controller id>.<parameter id>}.
+    *
+    * @param parent the composite supplied by the preference dialog
+    * @return the composite containing the editors (empty if no controller, factory or parameters
+    *         are available)
+    */
+   @Override
+   protected Control createContents(Composite parent) {
+     Composite top = new Composite(parent, SWT.LEFT);
+     top.setLayoutData(new GridData(GridData.FILL_HORIZONTAL));
+     top.setLayout(new GridLayout());
+
+     // Without a controller there is nothing to configure; show an empty page instead of failing.
+     if (algorithmController == null) {
+       return top;
+     }
+     TextRulerLearnerFactory factory = algorithmController.getFactory();
+     if (factory == null) {
+       return top;
+     }
+     TextRulerLearnerParameter[] params = factory.getAlgorithmParameters();
+     Map<String, Object> values = factory.getAlgorithmParameterStandardValues();
+     if (params == null) {
+       return top;
+     }
+
+     for (TextRulerLearnerParameter p : params) {
+       String id = algorithmController.getID() + "." + p.id;
+       // A parameter without a standard value keeps whatever default the store already has.
+       Object standard = (values == null) ? null : values.get(p.id);
+       switch (p.type) {
+         case ML_BOOL_PARAM: {
+           FieldEditor editor = new BooleanFieldEditor(id, p.name, top);
+           if (standard != null) {
+             store.setDefault(id, ((Boolean) standard).booleanValue());
+           }
+           bindEditor(editor);
+           break;
+         }
+         case ML_FLOAT_PARAM:
+         case ML_INT_PARAM:
+         case ML_STRING_PARAM: {
+           FieldEditor editor = new StringFieldEditor(id, p.name, top);
+           if (standard != null) {
+             store.setDefault(id, standard.toString());
+           }
+           bindEditor(editor);
+           break;
+         }
+         case ML_SELECT_PARAM:
+           break;
+       }
+     }
+     return top;
+   }
+
+   /** Remembers the editor, connects it to the preference store and loads the stored value. */
+   private void bindEditor(FieldEditor editor) {
+     fields.add(editor);
+     editor.setPreferenceStore(store);
+     editor.load();
+   }
+
+   /** No workbench-specific initialization is required. */
+   @Override
+   public void init(IWorkbench workbench) {
+   }
+
+   /** Resets every editor to its default value, then runs the inherited handling. */
+   @Override
+   protected void performDefaults() {
+     for (FieldEditor editor : fields) {
+       editor.loadDefault();
+     }
+     // JFace asks overriding subclasses to call the super implementation.
+     super.performDefaults();
+   }
+
+   /**
+    * Writes the current value of every editor to the preference store.
+    *
+    * @return always {@code true}, so the dialog is allowed to close
+    */
+   @Override
+   public boolean performOk() {
+     for (FieldEditor editor : fields) {
+       editor.store();
+     }
+     return true;
+   }
+
+ }
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2PreferencePage.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/uimaj-ep-textmarker-textruler/src/main/java/org/apache/uima/textmarker/textruler/learner/lp2/NaiveLP2PreferencePage.java
------------------------------------------------------------------------------
svn:mime-type = text/plain