You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by jo...@apache.org on 2011/08/01 16:21:35 UTC
svn commit: r1152792 [7/10] - in /uima/sandbox/trunk/TextMarker:
org.apache.uima.tm.textruler.lp2/ org.apache.uima.tm.textruler.lp2/META-INF/
org.apache.uima.tm.textruler.lp2/bin/ org.apache.uima.tm.textruler.lp2/src/
org.apache.uima.tm.textruler.lp2/s...
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,468 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.tm.textmarker.engine.TextMarkerEngine;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+import org.apache.uima.tm.textruler.extension.TextRulerLearner;
+import org.apache.uima.tm.textruler.extension.TextRulerLearnerDelegate;
+import org.apache.uima.tm.textruler.tools.MemoryWatch;
+import org.apache.uima.util.FileUtils;
+import org.eclipse.core.runtime.IPath;
+import org.eclipse.core.runtime.Path;
+
+
+/**
+ *
+ * This class provides basic and shared functionality for all the implemented ML algorithms. New
+ * algorithms can subclass this class and use the whole framework for faster development.
+ *
+ */
+public abstract class TextRulerBasicLearner implements TextRulerLearner, CasCacheLoader {
+
+ protected TextRulerLearnerDelegate delegate;
+
+ protected AnalysisEngine ae;
+
+ protected TextRulerExampleDocumentSet exampleDocuments;
+
+ protected String inputDirectory;
+
+ protected String tempDirectory;
+
+ protected String preprocessorTMFile;
+
+ protected Set<String> filterSet;
+
+ protected Set<String> filterSetWithSlotNames;
+
+ protected String[] slotNames;
+
+ protected CasCache casCache;
+
+ protected CAS algTestCAS;
+
+  /**
+   * Stores the learner configuration, builds the combined filter set and creates the CAS cache.
+   *
+   * @param inputDir
+   *          folder that {@link #run()} hands to the example document set
+   * @param prePropTMFile
+   *          preprocessing TextMarker script; its engine descriptor is used to load the analysis
+   *          engine
+   * @param tmpDir
+   *          folder for the temporary rule and settings files
+   * @param slotNames
+   *          type names of the slots to learn
+   * @param filterSet
+   *          type names to filter; may be {@code null}, which is treated like an empty set when
+   *          the combined set is built
+   * @param delegate
+   *          receiver of status updates and source of abort requests; may be {@code null}
+   */
+  public TextRulerBasicLearner(String inputDir, String prePropTMFile, String tmpDir,
+          String[] slotNames, Set<String> filterSet, TextRulerLearnerDelegate delegate) {
+    super();
+    this.preprocessorTMFile = prePropTMFile;
+    this.tempDirectory = tmpDir;
+    this.slotNames = slotNames;
+    this.inputDirectory = inputDir;
+    this.delegate = delegate;
+    this.filterSet = filterSet;
+    // getTMFilterCommandString() accepts a null filter set, so the constructor must not fail on
+    // it either
+    this.filterSetWithSlotNames = (filterSet == null) ? new HashSet<String>()
+            : new HashSet<String>(filterSet);
+    for (String s : slotNames) {
+      // the slot type itself plus its left and right boundary helper types
+      filterSetWithSlotNames.add(s);
+      filterSetWithSlotNames.add(TextRulerTarget.getSingleSlotTypeName(
+              MLTargetType.SINGLE_LEFT_BOUNDARY, s));
+      filterSetWithSlotNames.add(TextRulerTarget.getSingleSlotTypeName(
+              MLTargetType.SINGLE_RIGHT_BOUNDARY, s));
+    }
+
+    // TODO make size configurable !? share e.g. 100 places for all running algorithms ?
+    this.casCache = new CasCache(100, this);
+  }
+
+ protected String tempDirectory() {
+ return TextRulerToolkit.addTrailingSlashToPath(tempDirectory);
+ }
+
+  /**
+   * Asks the delegate whether the run should be aborted.
+   *
+   * @return the delegate's answer, or {@code false} when there is no delegate
+   */
+  protected boolean shouldAbort() {
+    return delegate != null && delegate.shouldAbort();
+  }
+
+  /**
+   * Returns the analysis engine, loading and configuring it on first use.
+   *
+   * The engine is loaded from the descriptor belonging to the preprocessing script and then
+   * reconfigured so that it runs the temporary rules file of this learner.
+   *
+   * @return the engine, or {@code null} if reconfiguring the freshly loaded engine failed (the
+   *         engine nevertheless stays stored in the field and is returned by later calls)
+   */
+  public AnalysisEngine getAnalysisEngine() {
+    if (ae != null) {
+      return ae;
+    }
+
+    Path preprocessorPath = new Path(preprocessorTMFile);
+    String descriptorFile = TextRulerToolkit.getEngineDescriptorFromTMSourceFile(preprocessorPath);
+    sendStatusUpdateToDelegate("loading AE...", TextRulerLearnerState.ML_INITIALIZING, false);
+    ae = TextRulerToolkit.loadAnalysisEngine(descriptorFile);
+
+    // no default filtering: filters are added manually with the FILTERTYPE expression
+    ae.setConfigParameterValue(TextMarkerEngine.DEFAULT_FILTERED_MARKUPS, new String[0]);
+
+    // point the engine at the temporary rules file: script name and containing folder
+    IPath rulesPath = new Path(getTempRulesFileName());
+    String mainScript = rulesPath.removeFileExtension().lastSegment();
+    ae.setConfigParameterValue(TextMarkerEngine.MAIN_SCRIPT, mainScript);
+    String scriptFolder = rulesPath.removeLastSegments(1).toPortableString();
+    ae.setConfigParameterValue(TextMarkerEngine.SCRIPT_PATHS, new String[] { scriptFolder });
+    // ae.setConfigParameterValue(TextMarkerEngine.SEEDERS, new String[] {""});
+    ae.setConfigParameterValue(TextMarkerEngine.ADDITIONAL_SCRIPTS, new String[0]);
+
+    try {
+      ae.reconfigure();
+    } catch (ResourceConfigurationException e) {
+      e.printStackTrace();
+      return null;
+    }
+    return ae;
+  }
+
+  /**
+   * Checks that every configured slot type exists in the type system of the test CAS.
+   *
+   * An error status naming the missing types is sent to the delegate only when something is
+   * actually missing (previously the error status was sent on every call, even on success).
+   *
+   * @return {@code true} if all slot types are present, {@code false} otherwise or when no test
+   *         CAS could be obtained
+   */
+  protected boolean checkForMandatoryTypes() {
+    CAS someCas = getTestCAS();
+    if (someCas == null) {
+      // getTestCAS() returns null when allocating the CAS failed
+      sendStatusUpdateToDelegate("Error: Could not create a test CAS for checking the types.",
+              TextRulerLearnerState.ML_ERROR, false);
+      return false;
+    }
+    TypeSystem ts = someCas.getTypeSystem();
+
+    List<String> missingTypes = new ArrayList<String>();
+    for (String s : slotNames) {
+      if (ts.getType(s) == null) {
+        missingTypes.add(s);
+      }
+    }
+    if (missingTypes.isEmpty()) {
+      return true;
+    }
+
+    StringBuilder missingString = new StringBuilder();
+    for (String typeName : missingTypes) {
+      if (missingString.length() > 0) {
+        missingString.append(", ");
+      }
+      missingString.append(typeName);
+    }
+    sendStatusUpdateToDelegate("Error: Some Slot- or Helper-Types were not found in TypeSystem: "
+            + missingString, TextRulerLearnerState.ML_ERROR, false);
+    return false;
+  }
+
+  /**
+   * Makes sure the temp directory exists, creating it (including missing parent folders) if
+   * needed.
+   *
+   * @return {@code true} if the directory exists afterwards, {@code false} if it could not be
+   *         created or the path is occupied by a non-directory
+   */
+  protected boolean createTempDirIfNeccessary() {
+    File dir = new File(tempDirectory());
+    if (dir.isDirectory()) {
+      return true;
+    }
+    // mkdirs() returns false when the folder appeared in the meantime, hence the re-check
+    return dir.mkdirs() || dir.isDirectory();
+  }
+
+  /**
+   * Executes one complete learning run: prepares the temp directory and analysis engine, checks
+   * the slot types, collects the example documents, calls {@link #doRun()} and finally releases
+   * the CAS cache and the shared test CAS.
+   *
+   * Exceptions thrown by {@link #doRun()} are caught and reported as {@code ML_ERROR}. The
+   * resources are released in a {@code finally} block, so they are also freed when something
+   * else fails unexpectedly. After a run the CAS cache is gone, i.e. the learner is not meant to
+   * be run a second time.
+   */
+  public void run() {
+    if (!createTempDirIfNeccessary()) {
+      sendStatusUpdateToDelegate("ERROR CREATING TEMPORARY DIRECTORY!",
+              TextRulerLearnerState.ML_ERROR, false);
+      return;
+    }
+
+    try {
+      getAnalysisEngine(); // be sure that our AE was created...
+
+      if (checkForMandatoryTypes()) {
+        sendStatusUpdateToDelegate("Finding documents...", TextRulerLearnerState.ML_INITIALIZING,
+                false);
+        exampleDocuments = new TextRulerExampleDocumentSet(inputDirectory, casCache);
+        if (!shouldAbort()) {
+          sendStatusUpdateToDelegate("Starting...", TextRulerLearnerState.ML_RUNNING, true);
+
+          try {
+            doRun();
+          } catch (Exception e) {
+            e.printStackTrace();
+            sendStatusUpdateToDelegate("Aborted due to exception!", TextRulerLearnerState.ML_ERROR,
+                    true);
+          }
+
+          if (TextRulerToolkit.DEBUG) {
+            try {
+              File file = new File(tempDirectory() + "results.tm");
+              FileUtils.saveString2File(getResultString(), file);
+            } catch (Exception e) {
+              e.printStackTrace();
+            }
+          }
+          cleanUp();
+        }
+      }
+    } finally {
+      releaseRunResources();
+    }
+
+    if (shouldAbort()) {
+      sendStatusUpdateToDelegate("Aborted!", TextRulerLearnerState.ML_ABORTED, false);
+    }
+  }
+
+  /** Clears the CAS cache, drops the example documents and releases the shared test CAS. */
+  private void releaseRunResources() {
+    if (casCache != null) {
+      casCache.clear();
+      casCache = null;
+    }
+    exampleDocuments = null; // clear reference
+    if (algTestCAS != null) {
+      algTestCAS.reset();
+      GlobalCASSource.releaseCAS(algTestCAS); // algTestCAS.release();
+      algTestCAS = null;
+    }
+  }
+
+  /**
+   * Reads a CAS from an XMI file via {@code TextRulerToolkit.readCASfromXMIFile}, handing over
+   * the analysis engine field and the CAS offered for reuse.
+   *
+   * @param fileName
+   *          XMI file to read
+   * @param reuseCAS
+   *          CAS object passed on to the toolkit for reuse
+   * @return whatever the toolkit returns for the file
+   */
+  public CAS loadCAS(String fileName, CAS reuseCAS) {
+    // note: reads the field directly, it does not trigger lazy loading of the engine
+    final CAS loaded = TextRulerToolkit.readCASfromXMIFile(fileName, ae, reuseCAS);
+    return loaded;
+  }
+
+  /**
+   * Forwards a status update to the delegate; does nothing when no delegate is set.
+   *
+   * @param statusString
+   *          human readable status text
+   * @param state
+   *          learner state to report
+   * @param ruleBaseChanged
+   *          flag passed through to the delegate
+   */
+  protected void sendStatusUpdateToDelegate(String statusString, TextRulerLearnerState state,
+          boolean ruleBaseChanged) {
+    if (delegate == null) {
+      return;
+    }
+    delegate.algorithmStatusUpdate(this, statusString, state, ruleBaseChanged);
+  }
+
+ /**
+  * The actual learning algorithm; needs to be implemented by concrete algorithm subclasses.
+  * It is invoked by run() once the temp directory, the type check and the example document set
+  * are in place. Any exception it throws is caught by run() and reported as ML_ERROR.
+  */
+ protected abstract void doRun();
+
+
+ /**
+  * Hook that run() invokes after doRun() has returned or thrown an exception; it is not invoked
+  * when the run was aborted before doRun() started. This default implementation does nothing
+  * (presumably subclasses override it to free their own resources).
+  */
+ protected void cleanUp() {
+
+ }
+
+  /** Returns the path of the temporary rules file, {@code rules.tm} inside the temp directory. */
+  public String getTempRulesFileName() {
+    final String folder = tempDirectory();
+    return folder + "rules.tm";
+  }
+
+  /**
+   * Returns the path of the intermediate rules file, {@code intermediaterules.tm} inside the
+   * temp directory.
+   */
+  public String getIntermediateRulesFileName() {
+    final String folder = tempDirectory();
+    return folder + "intermediaterules.tm";
+  }
+
+  /**
+   * Compares the slot instances found in a processed test CAS with the positive examples of the
+   * original document and records the outcome in the statistics collector. This is the standard
+   * implementation; concrete subclasses may override it if needed.
+   *
+   * @param originalDoc
+   *          document providing the expected positive examples
+   * @param testCas
+   *          CAS the rule(s) were applied to
+   * @param target
+   *          learning target used to extract the instances from the test CAS
+   * @param c
+   *          collector receiving covered positives and negatives
+   * @param collectNegativeExamples
+   *          if {@code true}, wrongly covered instances are stored as negative examples,
+   *          otherwise they are only counted
+   */
+  public void compareOriginalDocumentWithTestCAS(TextRulerExampleDocument originalDoc, CAS testCas,
+          TextRulerTarget target, TextRulerStatisticsCollector c, boolean collectNegativeExamples) {
+    List<TextRulerExample> expected = originalDoc.getPositiveExamples();
+    List<TextRulerExample> found = originalDoc.createSlotInstancesForCAS(testCas, target, false);
+
+    // TODO if you need false negatives (missing annotations), please reactivate the code
+    // commented out with FALSENEGATIVES
+
+    for (TextRulerExample candidate : found) {
+      TextRulerExample match = TextRulerToolkit.exampleListContainsAnnotation(expected,
+              candidate.getAnnotation());
+      if (match != null) {
+        // add covered example and increment positive counter
+        c.addCoveredPositive(match);
+        // FALSENEGATIVES originalPositives.remove(coveredExample);
+        continue;
+      }
+      if (!collectNegativeExamples) {
+        c.incCoveredNegatives(1);
+        continue;
+      }
+      candidate.setPositive(false);
+      c.addCoveredNegative(candidate);
+    }
+
+    // FALSENEGATIVES c.incMissingPositives(originalPositives.size());
+  }
+
+ /**
+  * Tells the rule testing code how to treat wrongly covered instances: the returned value is
+  * passed as collectNegativeExamples to compareOriginalDocumentWithTestCAS, where true means the
+  * instances are stored as negative examples and false means they are only counted.
+  */
+ public abstract boolean collectNegativeCoveredInstancesWhenTesting();
+
+  /**
+   * Tests one rule on one document with the shared test CAS: the CAS is filled from the document
+   * for the rule's target, the rule is applied and evaluated, and the CAS is reset afterwards.
+   *
+   * @param rule
+   *          rule to test
+   * @param doc
+   *          document to test on
+   * @param c
+   *          collector receiving the covering statistics
+   */
+  public void testRuleOnDocument(final TextRulerRule rule, final TextRulerExampleDocument doc,
+          final TextRulerStatisticsCollector c) {
+    final CAS sharedCas = getTestCAS();
+    final TextRulerTarget ruleTarget = rule.getTarget();
+    doc.resetAndFillTestCAS(sharedCas, ruleTarget);
+    testRuleOnDocument(rule, doc, c, sharedCas);
+    sharedCas.reset();
+  }
+
+  /**
+   * Applies one rule to an already filled test CAS and records the result: the rule is written
+   * to the temporary rules file, the analysis engine processes the CAS and the outcome is
+   * compared with the original document. In debug mode the CAS is dumped to the temp directory
+   * before and after processing. Exceptions are printed and otherwise ignored.
+   *
+   * @param rule
+   *          rule to test
+   * @param doc
+   *          original document the CAS was filled from
+   * @param c
+   *          collector receiving the covering statistics
+   * @param testCas
+   *          prepared test CAS; it is not reset by this method
+   */
+  public void testRuleOnDocument(final TextRulerRule rule, final TextRulerExampleDocument doc,
+          final TextRulerStatisticsCollector c, CAS testCas) {
+    if (TextRulerToolkit.DEBUG) {
+      MemoryWatch.watch();
+    }
+    try {
+      final String rulesFile = getTempRulesFileName();
+      rule.saveToRulesFile(rulesFile);
+      if (TextRulerToolkit.DEBUG) {
+        final String before = tempDirectory() + "testCas.xmi";
+        TextRulerToolkit.writeCAStoXMIFile(testCas, before);
+      }
+      // note: uses the engine field directly, so the engine must have been loaded before
+      ae.process(testCas);
+      if (TextRulerToolkit.DEBUG) {
+        final String after = tempDirectory() + "testCasProcessed.xmi";
+        TextRulerToolkit.writeCAStoXMIFile(testCas, after);
+      }
+      compareOriginalDocumentWithTestCAS(doc, testCas, rule.getTarget(), c,
+              collectNegativeCoveredInstancesWhenTesting());
+    } catch (Exception failure) {
+      failure.printStackTrace();
+    }
+  }
+
+  /**
+   * Tests a single rule on all documents of a set and stores the summed statistics in the rule.
+   * Testing stops early when an abort is requested; the statistics gathered so far are still
+   * stored.
+   *
+   * If you have many rules to test, please use testRulesOnDocumentSet for performance reasons!
+   *
+   * @param rule
+   *          rule to test
+   * @param documents
+   *          documents to test on
+   */
+  public void testRuleOnDocumentSet(final TextRulerRule rule,
+          final TextRulerExampleDocumentSet documents) {
+    final TextRulerStatisticsCollector total = new TextRulerStatisticsCollector();
+    final TextRulerExampleDocument[] docs = documents.getSortedDocumentsInCacheOptimizedOrder();
+
+    for (int i = 0; i < docs.length; i++) {
+      testRuleOnDocument(rule, docs[i], total);
+      if (shouldAbort()) {
+        break;
+      }
+    }
+    rule.setCoveringStatistics(total);
+  }
+
+  /**
+   * Writes a script to the temporary rules file, fills the shared test CAS from the document and
+   * lets the analysis engine process it. I/O and processing exceptions are printed and do not
+   * stop the method.
+   *
+   * @param script
+   *          script text to apply
+   * @param doc
+   *          document the test CAS is filled from
+   * @param target
+   *          learning target used for filling the test CAS
+   * @return the shared test CAS (not reset; the caller sees the processing result in it)
+   */
+  public CAS applyScriptOnDocument(String script, final TextRulerExampleDocument doc,
+          TextRulerTarget target) {
+    final File rulesFile = new File(getTempRulesFileName());
+    try {
+      FileUtils.saveString2File(script, rulesFile);
+    } catch (IOException writeFailure) {
+      writeFailure.printStackTrace();
+    }
+
+    final AnalysisEngine engine = getAnalysisEngine();
+    final CAS filledCas = getTestCAS();
+    doc.resetAndFillTestCAS(filledCas, target);
+    try {
+      engine.process(filledCas);
+    } catch (AnalysisEngineProcessException processFailure) {
+      processFailure.printStackTrace();
+    }
+    return filledCas;
+  }
+
+  /**
+   * Tests several rules on all documents of a set and stores the summed statistics in each rule.
+   * All rules are expected to share the learning target of the first rule. When an abort is
+   * requested the method returns immediately without storing any statistics.
+   *
+   * @param rules
+   *          rules to test; nothing happens for an empty list
+   * @param documents
+   *          documents to test on
+   */
+  public void testRulesOnDocumentSet(final List<? extends TextRulerRule> rules,
+          final TextRulerExampleDocumentSet documents) {
+    if (rules.isEmpty()) {
+      return;
+    }
+    final List<TextRulerStatisticsCollector> collectors = new ArrayList<TextRulerStatisticsCollector>();
+    final TextRulerExampleDocument[] docs = documents.getSortedDocumentsInCacheOptimizedOrder();
+    final TextRulerTarget commonTarget = rules.get(0).getTarget();
+
+    // one collector per rule, stored at the rule's index
+    for (int i = 0; i < rules.size(); i++) {
+      collectors.add(new TextRulerStatisticsCollector());
+    }
+
+    final CAS sharedCas = getTestCAS();
+    for (int docIndex = 0; docIndex < docs.length; docIndex++) {
+      final TextRulerExampleDocument currentDoc = docs[docIndex];
+      for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) {
+        final TextRulerRule currentRule = rules.get(ruleIndex);
+        final TextRulerStatisticsCollector collector = collectors.get(ruleIndex);
+
+        if (TextRulerToolkit.DEBUG && !commonTarget.equals(currentRule.getTarget())) {
+          TextRulerToolkit
+                  .log("[TextRulerBasicLearner.testRulesOnTrainingsSet] ERROR, ALL RULES MUST HAVE THE SAME LEARNING TARGET !");
+        }
+        // every rule needs a fresh copy of the document in the test CAS
+        currentDoc.resetAndFillTestCAS(sharedCas, commonTarget);
+        testRuleOnDocument(currentRule, currentDoc, collector, sharedCas);
+        if (shouldAbort()) {
+          return;
+        }
+      }
+    }
+    // do not release the shared test CAS, only reset it: it gets released at the end of the
+    // whole algorithm
+    sharedCas.reset();
+
+    for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) {
+      rules.get(ruleIndex).setCoveringStatistics(collectors.get(ruleIndex));
+    }
+  }
+
+  /**
+   * Tests several rules on one document and stores the resulting statistics in each rule. All
+   * rules are expected to share the learning target of the first rule. When an abort is
+   * requested the method returns immediately without storing any statistics.
+   *
+   * @param rules
+   *          rules to test; nothing happens for an empty list
+   * @param document
+   *          document to test on
+   */
+  public void testRulesOnDocument(final List<? extends TextRulerRule> rules,
+          final TextRulerExampleDocument document) {
+    if (rules.isEmpty()) {
+      return;
+    }
+    final List<TextRulerStatisticsCollector> collectors = new ArrayList<TextRulerStatisticsCollector>();
+    final TextRulerTarget commonTarget = rules.get(0).getTarget();
+
+    // one collector per rule, stored at the rule's index
+    for (int i = 0; i < rules.size(); i++) {
+      collectors.add(new TextRulerStatisticsCollector());
+    }
+
+    final CAS sharedCas = getTestCAS();
+    for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) {
+      final TextRulerRule currentRule = rules.get(ruleIndex);
+      final TextRulerStatisticsCollector collector = collectors.get(ruleIndex);
+
+      if (TextRulerToolkit.DEBUG && !commonTarget.equals(currentRule.getTarget())) {
+        TextRulerToolkit
+                .log("[TextRulerBasicLearner.testRulesOnTrainingsSet] ERROR, ALL RULES MUST HAVE THE SAME LEARNING TARGET !");
+      }
+      // every rule needs a fresh copy of the document in the test CAS
+      document.resetAndFillTestCAS(sharedCas, commonTarget);
+      testRuleOnDocument(currentRule, document, collector, sharedCas);
+      if (shouldAbort()) {
+        return;
+      }
+    }
+    // do not release the shared test CAS, only reset it: it gets released at the end of the
+    // whole algorithm
+    sharedCas.reset();
+
+    for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) {
+      rules.get(ruleIndex).setCoveringStatistics(collectors.get(ruleIndex));
+    }
+  }
+
+  /**
+   * Builds the header of a generated TextMarker file: package declaration, type system import
+   * and filter command, in that order.
+   */
+  public String getTMFileHeaderString() {
+    final String packagePart = getTMPackageString();
+    final String typeSystemPart = getTypeSystemImport();
+    final String filterPart = getTMFilterCommandString();
+    return packagePart + typeSystemPart + filterPart;
+  }
+
+ private String getTypeSystemImport() {
+ return "TYPESYSTEM " + getTypeSystemString(preprocessorTMFile) + ";\n\n";
+ }
+
+  /**
+   * Returns the name of the type system to import.
+   *
+   * @param fileString
+   *          path of the preprocessing script; currently ignored
+   * @return a fixed type system name
+   */
+  private String getTypeSystemString(String fileString) {
+    // TODO derive the type system from fileString; until then the result is hard-coded. The
+    // former unused File local was dropped, it only made a null path fail with an exception.
+    return "org.apache.uima.tm.citie.CompleteTypeSystemTypeSystem";
+  }
+
+  /** Returns the fixed {@code PACKAGE} declaration used for generated TextMarker files. */
+  public String getTMPackageString() {
+    final String packageDeclaration = "PACKAGE org.apache.uima.tm;\n\n";
+    return packageDeclaration;
+  }
+
+ public String getTMFilterCommandString() {
+ if (filterSet != null && filterSet.size() > 0) {
+ String fs = "";
+ for (String s : filterSet)
+ if (fs.length() == 0)
+ fs += TextRulerToolkit.getTypeShortName(s);
+ else
+ fs += ", " + TextRulerToolkit.getTypeShortName(s);
+
+ return "Document{->FILTERTYPE(" + fs + ")};\n\n";
+ } else
+ return "";
+ }
+
+  /**
+   * Returns the test CAS shared by this learner, allocating it from {@code GlobalCASSource} on
+   * first use.
+   *
+   * Background: resetting and releasing old CASes and creating new ones for every test (and,
+   * e.g., in CasCache for every loaded XMI) caused a big memory problem, maybe a UIMA memory
+   * issue. Working with an almost static amount of CAS objects and reusing them does not leak
+   * and also brought a performance boost.
+   *
+   * @return the shared test CAS, or {@code null} if allocating it failed
+   */
+  public CAS getTestCAS() {
+    if (algTestCAS != null) {
+      return algTestCAS;
+    }
+    try {
+      algTestCAS = GlobalCASSource.allocCAS(ae);
+    } catch (Exception allocationFailure) {
+      allocationFailure.printStackTrace();
+      return null;
+    }
+    return algTestCAS;
+  }
+
+  /**
+   * Appends the learner settings and the given algorithm parameters to {@code settings.txt} in
+   * the temp directory. Nothing is written when the temp directory cannot be created.
+   *
+   * @param params
+   *          algorithm parameters, written as {@code key = value} lines
+   */
+  protected void saveParametersToTempFolder(Map<String, Object> params) {
+    // checked once; the former second check right before writing was redundant
+    if (!createTempDirIfNeccessary()) {
+      return;
+    }
+    StringBuilder str = new StringBuilder("\nSettings:\n\n");
+    str.append("inputDir: ").append(inputDirectory);
+    str.append("\ntempDir: ").append(tempDirectory);
+    str.append("\npreprocessTMFile: ").append(preprocessorTMFile);
+    str.append("\n");
+
+    for (Entry<String, Object> e : params.entrySet()) {
+      str.append(e.getKey()).append(" = ").append(e.getValue()).append("\n");
+    }
+    TextRulerToolkit.appendStringToFile(tempDirectory() + "settings.txt", str.toString());
+  }
+
+  /** Returns the filter set exactly as it was passed to the constructor (no copy is made). */
+  public Set<String> getFilterSet() {
+    return this.filterSet;
+  }
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerBasicLearner.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,129 @@
+package org.apache.uima.tm.textruler.core;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+
+
+/**
+ *
+ * TextRulerExample encapsulates a single-slot, multi-slot or single-slot-boundary problem instance.
+ * These can be positive or negative examples from an example document, or they can be coverings of a
+ * rule or multiple rules that were applied to a document...
+ *
+ * hint: this could be renamed to MLInstance ?
+ *
+ */
+public class TextRulerExample {
+
+ protected TextRulerExampleDocument document;
+
+ protected TextRulerAnnotation annotations[]; // single-slot has only one
+
+ // annotation...
+ protected boolean isPositive;
+
+ protected TextRulerTarget target;
+
+  /**
+   * Creates a single-annotation example; the annotation is wrapped into a one-element array.
+   *
+   * @param document
+   *          document the example belongs to
+   * @param annotation
+   *          the example's only annotation
+   * @param isPositive
+   *          whether this is a positive example
+   * @param target
+   *          learning target of the example
+   */
+  public TextRulerExample(TextRulerExampleDocument document, TextRulerAnnotation annotation,
+          boolean isPositive, TextRulerTarget target) {
+    this(document, new TextRulerAnnotation[] { annotation }, isPositive, target);
+  }
+
+  /**
+   * Creates an example from an array of annotations. The array is stored as passed, without
+   * copying.
+   *
+   * @param document
+   *          document the example belongs to
+   * @param annotations
+   *          the example's annotations
+   * @param isPositive
+   *          whether this is a positive example
+   * @param target
+   *          learning target of the example
+   */
+  public TextRulerExample(TextRulerExampleDocument document, TextRulerAnnotation annotations[],
+          boolean isPositive, TextRulerTarget target) {
+    this.annotations = annotations;
+    this.target = target;
+    this.isPositive = isPositive;
+    this.document = document;
+  }
+
+  /** Returns the document this example belongs to. */
+  public TextRulerExampleDocument getDocument() {
+    return this.document;
+  }
+
+  /** Returns the CAS of the document this example belongs to. */
+  public CAS getDocumentCAS() {
+    final CAS documentCas = document.getCAS();
+    return documentCas;
+  }
+
+  /** Returns the first annotation of this example. */
+  public TextRulerAnnotation getAnnotation() {
+    final TextRulerAnnotation first = annotations[0];
+    return first;
+  }
+
+  /** Returns the internal annotation array of this example (no copy is made). */
+  public TextRulerAnnotation[] getAnnotations() {
+    return this.annotations;
+  }
+
+  /** Returns whether this is a positive example. */
+  public boolean isPositive() {
+    return this.isPositive;
+  }
+
+  /**
+   * Marks this example as positive or negative.
+   *
+   * @param flag
+   *          {@code true} for positive, {@code false} for negative
+   */
+  public void setPositive(boolean flag) {
+    this.isPositive = flag;
+  }
+
+  /** Returns the learning target of this example. */
+  public TextRulerTarget getTarget() {
+    return this.target;
+  }
+
+ @Override
+ public String toString() {
+ if (target.type != MLTargetType.MULTI_SLOT) {
+ if (annotations != null) {
+ if (target.type == MLTargetType.SINGLE_WHOLE_SLOT)
+ return getAnnotation().getCoveredText();
+ else
+ return "START at " + getAnnotation().getBegin(); // +","+getAnnotation().getEnd();
+ } else
+ return "<no text>";
+ } else {
+ String str = "";
+ for (TextRulerAnnotation a : annotations) {
+ if (a == null)
+ str += "<NULL>";
+ else
+ str += a.getType().getShortName() + ":" + a.getCoveredText() + ";";
+ }
+ return str;
+ }
+ }
+
+  /**
+   * Two examples are equal if they refer to the same CAS file, have the same polarity and
+   * target and hold equal annotations at every position.
+   *
+   * Unlike the former implementation this never throws: {@code null}, foreign objects, missing
+   * documents or targets and {@code null} annotation entries (which incomplete multi-slot
+   * examples contain) are compared null-safely.
+   *
+   * @param ob
+   *          object to compare with
+   * @return {@code true} if {@code ob} is an equal example
+   */
+  @Override
+  public boolean equals(Object ob) {
+    if (this == ob) {
+      return true;
+    }
+    if (!(ob instanceof TextRulerExample)) {
+      return false;
+    }
+    TextRulerExample o = (TextRulerExample) ob;
+
+    if (isPositive != o.isPositive) {
+      return false;
+    }
+    String fileName = (document == null) ? null : document.getCasFileName();
+    String otherFileName = (o.document == null) ? null : o.document.getCasFileName();
+    if (fileName == null ? otherFileName != null : !fileName.equals(otherFileName)) {
+      return false;
+    }
+    if (target == null ? o.target != null : !target.equals(o.target)) {
+      return false;
+    }
+
+    if (annotations == null || o.annotations == null) {
+      return annotations == o.annotations;
+    }
+    if (annotations.length != o.annotations.length) {
+      return false;
+    }
+    for (int i = 0; i < annotations.length; i++) {
+      TextRulerAnnotation mine = annotations[i];
+      TextRulerAnnotation theirs = o.annotations[i];
+      if (mine == null ? theirs != null : !mine.equals(theirs)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Hash code built from the CAS file name, the polarity, the target type and the offsets of
+   * the annotations.
+   *
+   * {@code null} annotation entries (incomplete multi-slot examples) are skipped instead of
+   * causing a NullPointerException; for examples without such entries the value is the same as
+   * before.
+   */
+  @Override
+  public int hashCode() {
+    int result = document.getCasFileName().hashCode()
+            * (isPositive ? 2 : 1)
+            * (target.type == MLTargetType.MULTI_SLOT ? 1
+                    : (target.type == MLTargetType.SINGLE_WHOLE_SLOT ? 2
+                            : (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY ? 3 : 4)));
+    int i = 1;
+    for (TextRulerAnnotation a : annotations) {
+      if (a != null) {
+        result *= i * (a.getBegin() + 1) * (a.getEnd() + 1);
+      }
+      // the position counter advances for missing entries too
+      i++;
+    }
+
+    return result;
+  }
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExample.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,338 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+
+
+/**
+ *
+ * TextRulerExampleDocument stands for one document usually loaded from an XMI file. It uses the
+ * given CasCache for storing its CAS with the XMI filename as the key.
+ *
+ * It holds ArrayLists for positive and negative MLExamples which can be filled on demand for a
+ * given learning target. E.g. single slot algorithms learn rules for each slot separately, so the
+ * work-flow is to clear the current examples and create new for the next slot target. The same is
+ * with single slot boundary algorithms like LP2: It first creates all left boundary examples,
+ * learns from them, clears the examples and creates the right boundary examples and so on.
+ *
+ * This class also provides the functionality to extract and create MLExamples of a given document or
+ * test CAS for a given TextRulerTarget.
+ *
+ * Especially for boundary algorithms you can call createBoundaryAnnotationsForCas to get boundary
+ * annotations at the beginnings and endings of an example slot.
+ *
+ * Caution (this is quite a bit inconvenient at the moment!): If a CAS gets loaded from the
+ * casCache, you have to call createBoundaryAnnotationsForCas again, so your casLoader must be aware
+ * of that (see BasicLP2 for an example) !
+ *
+ * hint: this could be renamed to MLDocument instead of TextRulerExampleDocument ?
+ */
+public class TextRulerExampleDocument {
+
+ protected String casFileName;
+
+ protected CasCache casCache;
+
+ protected List<TextRulerExample> positiveExamples = new ArrayList<TextRulerExample>();
+
+ protected List<TextRulerExample> negativeExamples = new ArrayList<TextRulerExample>();
+
+  /**
+   * Creates a document for a CAS file.
+   *
+   * @param casFileName
+   *          file name used as key when asking the cache for the CAS
+   * @param casCache
+   *          cache that provides the CAS
+   */
+  public TextRulerExampleDocument(String casFileName, CasCache casCache) {
+    this.casFileName = casFileName;
+    this.casCache = casCache;
+  }
+
+  /** Returns the CAS of this document by asking the CAS cache for the document's file name. */
+  public CAS getCAS() {
+    final CAS cached = casCache.getCAS(casFileName);
+    return cached;
+  }
+
+  /** Returns the current list of positive examples (the internal list, not a copy). */
+  public List<TextRulerExample> getPositiveExamples() {
+    return this.positiveExamples;
+  }
+
+  /** Returns the current list of negative examples (the internal list, not a copy). */
+  public List<TextRulerExample> getNegativeExamples() {
+    return this.negativeExamples;
+  }
+
+ protected void createPositiveExamplesForTarget(TextRulerTarget target) {
+ positiveExamples = createSlotInstancesForCAS(getCAS(), target, true);
+ }
+
+  /**
+   * Creates examples for a learning target from the annotations of a CAS.
+   * <ul>
+   * <li>Multi-slot targets: walks the annotation index of {@code aCas} and groups the slot
+   * annotations into examples; the array entry of a slot that was not seen stays {@code null}.</li>
+   * <li>Left/right correction targets: fills the learner's test CAS from this document, lets the
+   * learner's analysis engine process it, collects the wrongly placed boundary tags and pairs
+   * each of them with a correct tag found within the target's maximum shift distance. The result
+   * are {@link TextRulerShiftExample}s. Note that this branch does not read {@code aCas} and
+   * that it replaces this document's current examples via {@link #createExamplesForTarget}.</li>
+   * <li>All other targets: one example per annotation of the slot type.</li>
+   * </ul>
+   *
+   * @param aCas
+   *          CAS to extract the instances from
+   * @param target
+   *          learning target
+   * @param createFromRawTypeName
+   *          for single-slot targets: use the target's raw slot type name instead of its slot
+   *          type name
+   * @return the created examples, all flagged as positive; never {@code null}
+   */
+  public List<TextRulerExample> createSlotInstancesForCAS(CAS aCas, TextRulerTarget target,
+          boolean createFromRawTypeName) {
+    List<TextRulerExample> result = new ArrayList<TextRulerExample>();
+
+    if (target.isMultiSlot()) {
+      TypeSystem ts = aCas.getTypeSystem();
+      int currentSlotIndex = 0;
+      TextRulerAnnotation[] currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
+      List<Type> slotTypes = new ArrayList<Type>();
+      for (String s : target.slotNames) {
+        slotTypes.add(ts.getType(s));
+      }
+
+      for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().iterator(true); it.isValid(); it
+              .moveToNext()) {
+        AnnotationFS fs = it.get();
+        int idx = slotTypes.indexOf(fs.getType());
+        if (idx < 0) {
+          continue; // not a slot annotation
+        }
+        if (idx < currentSlotIndex) {
+          // the previous example was not complete, so we have to write it down
+          result.add(new TextRulerExample(this, currentAnnotations, true, target));
+          currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
+        }
+        currentAnnotations[idx] = new TextRulerAnnotation(fs, this);
+        if (idx >= target.slotNames.length - 1) {
+          // last slot reached: the example is complete
+          result.add(new TextRulerExample(this, currentAnnotations, true, target));
+          currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
+          currentSlotIndex = 0;
+        } else {
+          currentSlotIndex = idx + 1;
+        }
+      }
+      if (currentSlotIndex > 0) {
+        // trailing incomplete example
+        result.add(new TextRulerExample(this, currentAnnotations, true, target));
+      }
+
+    } else if (target.isLeftCorrection() || target.isRightCorrection()) {
+      // TODO
+      TextRulerBasicLearner learner = target.getLearner();
+      Set<String> filterSet = learner.getFilterSet();
+      CAS testCAS = learner.getTestCAS();
+      TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
+      resetAndFillTestCAS(testCAS, target);
+      CAS docCAS = getCAS();
+      TypeSystem ts = docCAS.getTypeSystem();
+      Type tokensRootType = ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME);
+      AnalysisEngine analysisEngine = learner.getAnalysisEngine();
+      try {
+        analysisEngine.process(testCAS);
+      } catch (AnalysisEngineProcessException e) {
+        // best effort: go on with whatever is in the test CAS, but do not hide the failure
+        TextRulerToolkit
+                .log("[TextRulerExampleDocument.createSlotInstancesForCAS] processing the test CAS failed: "
+                        + e);
+      }
+      TextRulerTarget newTarget = new TextRulerTarget(target.slotNames, target.getLearner());
+      if (target.isLeftCorrection()) {
+        newTarget.type = TextRulerTarget.MLTargetType.SINGLE_LEFT_BOUNDARY;
+      } else {
+        newTarget.type = TextRulerTarget.MLTargetType.SINGLE_RIGHT_BOUNDARY;
+      }
+      createExamplesForTarget(newTarget);
+      learner.compareOriginalDocumentWithTestCAS(this, testCAS, newTarget, c, true);
+      List<TextRulerExample> correctTags = getPositiveExamples();
+      List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>(c
+              .getCoveredNegativeExamples());
+      for (TextRulerExample wrongTag : wrongTags) {
+        // test, if there's a corresponding positive example
+        // somewhere around (within maxDistance)
+        List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag
+                .getAnnotation().getBegin(), target.getMaxShiftDistance(), TextRulerToolkit
+                .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType);
+        List<AnnotationFS> right = TextRulerToolkit.getAnnotationsAfterPosition(docCAS, wrongTag
+                .getAnnotation().getEnd(), target.getMaxShiftDistance() + 1, TextRulerToolkit
+                .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType);
+
+        // drop the first annotation after the position (one extra annotation was requested
+        // above); guard against an empty list, e.g. for a wrong tag at the end of the document
+        if (!right.isEmpty()) {
+          right.remove(0);
+        }
+
+        // TODO stop after the first found match or create one bad
+        // example for each found occurence ??!!
+        // for now: stop after one ! so create only ONE bad
+        // example...
+        int leftDistance = 0;
+        TextRulerExample leftCorrectTag = null;
+        for (int i = left.size() - 1; i >= 0; i--) {
+          leftDistance++;
+          TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(left.get(i),
+                  this, target, docCAS.getTypeSystem());
+          // Only checks the beginning of needle
+          leftCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags,
+                  needle);
+          if (leftCorrectTag != null)
+            break;
+        }
+
+        int rightDistance = 0;
+        TextRulerExample rightCorrectTag = null;
+        for (AnnotationFS fs : right) {
+          rightDistance++;
+          TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(fs, this, target,
+                  docCAS.getTypeSystem());
+          // Only checks the beginning of needle
+          rightCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags,
+                  needle);
+          if (rightCorrectTag != null)
+            break;
+        }
+
+        // NOTE(review): when one side has no match its distance is just the number of inspected
+        // annotations, so a match on the other side can lose the comparison below - confirm
+        // whether that is intended
+        TextRulerExample theCorrectTag = null;
+        if (rightDistance < leftDistance && rightCorrectTag != null)
+          theCorrectTag = rightCorrectTag;
+        else if (rightDistance > leftDistance && leftCorrectTag != null)
+          theCorrectTag = leftCorrectTag;
+        else // use the one that would lie in the slot filler:
+        {
+          if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY && rightCorrectTag != null)
+            theCorrectTag = rightCorrectTag;
+          else
+            theCorrectTag = leftCorrectTag;
+        }
+
+        if (theCorrectTag != null) {
+          TextRulerToolkit.log("FOUND BAD EXAMPLE FOR SHIFTING !!");
+          TextRulerShiftExample shiftExample = new TextRulerShiftExample(this, wrongTag
+                  .getAnnotation(), theCorrectTag.getAnnotation(), true, target);
+          result.add(shiftExample);
+        }
+      }
+
+    } else {
+      // do not use the boundary type here since we seek for the original slot !
+      List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas,
+              createFromRawTypeName ? target.getSingleSlotRawTypeName() : target
+                      .getSingleSlotTypeName());
+      for (AnnotationFS a : slots) {
+        result.add(new TextRulerExample(this, TextRulerToolkit.convertToTargetAnnotation(a, this,
+                target, aCas.getTypeSystem()), true, target));
+      }
+    }
+    return result;
+  }
+
+ /**
+  * Hook for creating the negative examples of a target; called by createExamplesForTarget right
+  * after the positive examples were created. This default implementation does nothing.
+  */
+ protected void createNegativeExamplesForTarget(TextRulerTarget target) {
+ // the default implementation does not support negative examples,
+ // subclasses can overwrite
+ // this if needed... or we could pass this as an argument to the
+ // constructor....
+ }
+
+  /**
+   * Creates the examples of this document for a learning target: first the positive examples,
+   * then the negative ones.
+   */
+  public void createExamplesForTarget(TextRulerTarget target) {
+    this.createPositiveExamplesForTarget(target);
+    this.createNegativeExamplesForTarget(target);
+  }
+
+  /** Empties the lists of positive and negative examples. */
+  public void clearCurrentExamples() {
+    this.positiveExamples.clear();
+    this.negativeExamples.clear();
+  }
+
+  /**
+   * Fills a test CAS for testing e.g. a rule or a rule set: the document text and all
+   * annotations of this document are copied, except annotations of the target's slot types (and,
+   * for boundary targets, of the base slot types as well).
+   *
+   * Caution: the test CAS gets reset first! Only type, begin and end of an annotation are
+   * copied. Annotations whose type is unknown in the test CAS are skipped and logged.
+   *
+   * @param testCas
+   *          CAS to reset and fill
+   * @param target
+   *          learning target whose slot annotations are left out
+   */
+  public void resetAndFillTestCAS(CAS testCas, TextRulerTarget target) {
+    testCas.reset();
+    CAS docCas = getCAS();
+    testCas.setDocumentText(docCas.getDocumentText());
+
+    // copy all annotations except the target-annotations:
+    TypeSystem ts = docCas.getTypeSystem();
+
+    List<Type> slotTypes = new ArrayList<Type>();
+
+    for (String s : target.getSlotTypeNames()) {
+      slotTypes.add(ts.getType(s));
+    }
+
+    if (target.isBoundary()) {
+      // add the base types (without START and END markers) also !
+      for (String s : target.slotNames) {
+        slotTypes.add(ts.getType(s));
+      }
+    }
+
+    for (FSIterator<AnnotationFS> it = docCas.getAnnotationIndex().iterator(true); it.isValid(); it
+            .moveToNext()) {
+      AnnotationFS fs = it.get();
+      if (slotTypes.contains(fs.getType())) {
+        continue;
+      }
+      // the test CAS has its own type system, so look the type up by name
+      Type t = testCas.getTypeSystem().getType(fs.getType().getName());
+      if (t != null) {
+        AnnotationFS createAnnotation = testCas.createAnnotation(t, fs.getBegin(), fs.getEnd());
+        testCas.addFsToIndexes(createAnnotation);
+      } else {
+        // fixed: the message lacked the space in front of "is"
+        TextRulerToolkit.log("Type " + fs.getType().getName() + " is unknown in test CAS");
+      }
+    }
+  }
+
+  /** Returns the CAS file name this document was created with. */
+  public String getCasFileName() {
+    return this.casFileName;
+  }
+
+  /**
+   * Adds boundary annotations for every annotation of a slot type: a left boundary annotation on
+   * the first token within the slot and a right boundary annotation on the last one. Slot
+   * annotations that contain no token (after filtering) are skipped instead of failing with an
+   * IndexOutOfBoundsException.
+   *
+   * @param aCas
+   *          CAS to add the boundary annotations to
+   * @param slotName
+   *          type name of the slot; the boundary type names are derived from it by appending the
+   *          toolkit's boundary extensions
+   * @param tokenFilterSet
+   *          filter set used when collecting the tokens within a slot
+   */
+  public static void createBoundaryAnnotationsForCas(CAS aCas, String slotName,
+          Set<String> tokenFilterSet) {
+    List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName);
+    TypeSystem ts = aCas.getTypeSystem();
+    for (AnnotationFS a : slots) {
+
+      List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas, a
+              .getBegin(), a.getEnd(), TextRulerToolkit.getFilterSetWithSlotName(slotName,
+              tokenFilterSet), ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME));
+      if (slotTokens == null || slotTokens.isEmpty()) {
+        // nothing to anchor the boundaries on
+        continue;
+      }
+      AnnotationFS first = slotTokens.get(0);
+      AnnotationFS last = slotTokens.get(slotTokens.size() - 1);
+      Type typeLB = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
+      aCas.addFsToIndexes(aCas.createAnnotation(typeLB, first.getBegin(), first.getEnd()));
+      Type typeRB = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+      aCas.addFsToIndexes(aCas.createAnnotation(typeRB, last.getBegin(), last.getEnd()));
+    }
+  }
+
+  /**
+   * Removes all left and right boundary annotations of a slot from the indexes of a CAS. The
+   * annotations are collected first and removed afterwards, so the index is not changed while it
+   * is iterated.
+   *
+   * This method is not tested yet!
+   *
+   * @param aCas
+   *          CAS to remove the annotations from
+   * @param slotName
+   *          type name of the slot whose boundary annotations are removed
+   */
+  public static void removeBoundaryAnnotationsFromCas(CAS aCas, String slotName) {
+    TypeSystem ts = aCas.getTypeSystem();
+    Type startType = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
+    Type endType = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+
+    List<AnnotationFS> toRemove = new ArrayList<AnnotationFS>();
+    for (Type boundaryType : new Type[] { startType, endType }) {
+      FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(boundaryType).iterator(true);
+      while (it.isValid()) {
+        toRemove.add(it.get());
+        it.moveToNext();
+      }
+    }
+
+    for (AnnotationFS boundary : toRemove) {
+      aCas.removeFsFromIndexes(boundary);
+    }
+  }
+
+ public static synchronized TextRulerExample exampleListContainsAnnotation(
+ List<TextRulerExample> list, TextRulerAnnotation ann) {
+ TextRulerExample needle = new TextRulerExample(null, ann, true, null);
+
+ int index = Collections.binarySearch(list, needle, new Comparator<TextRulerExample>() {
+ public int compare(TextRulerExample o1, TextRulerExample o2) {
+ TextRulerAnnotation afs1 = o1.getAnnotation();
+ TextRulerAnnotation afs2 = o2.getAnnotation();
+ if (afs1.getBegin() < afs2.getBegin())
+ return -1;
+ else if (afs1.getBegin() > afs2.getBegin())
+ return 1;
+ else
+ return 0;
+ }
+ });
+ if (index >= 0)
+ return list.get(index);
+ else
+ return null;
+ }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocument.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,225 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+
+/**
+ * 
+ * TextRulerExampleDocumentSet encapsulates an input set of documents, e.g. examples for a learning
+ * algorithm. It creates an instance of TextRulerExampleDocument for each XMI file found in the
+ * passed input folder.
+ * 
+ * For loading CASes you have to provide a CasCache. If you use TextRulerBasicLearner, this is done
+ * for you automatically.
+ * 
+ * hint: this could be renamed to MLDocumentSet instead of TextRulerExampleDocumentSet ?
+ */
+public class TextRulerExampleDocumentSet {
+
+  protected List<TextRulerExampleDocument> documents;
+
+  protected CasCache casCache;
+
+  /**
+   * Creates one document per file ending in ".xmi" inside the given folder.
+   * 
+   * @param xmiFolderName
+   *          path of the folder that is scanned for XMI files
+   * @param casCache
+   *          the cache handed to every created document
+   */
+  public TextRulerExampleDocumentSet(String xmiFolderName, CasCache casCache) {
+    super();
+    documents = new ArrayList<TextRulerExampleDocument>();
+    this.casCache = casCache;
+    File trainingFolder = new File(xmiFolderName);
+    File[] files = trainingFolder.listFiles(new FilenameFilter() {
+      public boolean accept(File dir, String name) {
+        return (name.endsWith(".xmi"));
+      }
+    });
+
+    // listFiles() returns null if the path is not a directory or cannot be read; treat that as
+    // an empty document set instead of failing with a NullPointerException
+    if (files == null) {
+      TextRulerToolkit.log("[TextRulerExampleDocumentSet] cannot list XMI files in folder: "
+              + xmiFolderName);
+      return;
+    }
+
+    for (File file : files) {
+      TextRulerToolkit.log("found document XMI file: " + file.getName());
+      documents.add(new TextRulerExampleDocument(file.getAbsolutePath(), casCache));
+    }
+  }
+
+  // for subset creations:
+  protected TextRulerExampleDocumentSet(String[] inputXmiFiles, CasCache casCache) {
+    super();
+    this.casCache = casCache;
+    documents = new ArrayList<TextRulerExampleDocument>();
+    for (String fileName : inputXmiFiles) {
+      documents.add(new TextRulerExampleDocument(fileName, casCache));
+    }
+  }
+
+  /**
+   * Lets every document create its examples for the given target. Documents whose CAS is already
+   * cached are visited first.
+   */
+  public void createExamplesForTarget(TextRulerTarget target) {
+    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder();
+    for (TextRulerExampleDocument doc : sortedDocs) {
+      doc.createExamplesForTarget(target);
+    }
+  }
+
+  public void clearCurrentExamples() {
+    for (TextRulerExampleDocument doc : documents) {
+      doc.clearCurrentExamples();
+    }
+  }
+
+  public Collection<CAS> getCachedCASes() {
+    return casCache.getCachedCASes();
+  }
+
+  public boolean casCacheContainsKey(String key) {
+    return casCache.containsElementWithKey(key);
+  }
+
+  public List<TextRulerExample> getAllExamples() {
+    return getAllExamples(false);
+  }
+
+  public List<TextRulerExample> getAllPositiveExamples() {
+    return getAllExamples(true);
+  }
+
+  /**
+   * Collects the examples of all documents into one new list.
+   * 
+   * @param onlyPositives
+   *          if true, negative examples are left out
+   */
+  public List<TextRulerExample> getAllExamples(boolean onlyPositives) {
+    List<TextRulerExample> result = new ArrayList<TextRulerExample>();
+    for (TextRulerExampleDocument doc : documents) {
+      result.addAll(doc.getPositiveExamples());
+      if (!onlyPositives) {
+        result.addAll(doc.getNegativeExamples());
+      }
+    }
+    return result;
+  }
+
+  public List<TextRulerExampleDocument> getDocuments() {
+    return documents;
+  }
+
+  /**
+   * Returns the given documents as an array in which the documents whose CAS file name is
+   * currently present in the CAS cache come first, followed by all remaining documents.
+   */
+  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder(
+          Collection<TextRulerExampleDocument> documents) {
+    Set<TextRulerExampleDocument> docsLeft = new HashSet<TextRulerExampleDocument>(documents);
+    TextRulerExampleDocument[] sortedDocs = new TextRulerExampleDocument[documents.size()];
+
+    // "sort" the currently cached documents to the front of the document
+    // list, so that
+    // we can use them directly and do not have to reload all docs every time
+    // we come here!
+    int i = 0;
+    for (TextRulerExampleDocument doc : documents) {
+      if (casCacheContainsKey(doc.getCasFileName())) {
+        docsLeft.remove(doc);
+        sortedDocs[i] = doc;
+        i++;
+      }
+    }
+    for (TextRulerExampleDocument doc : docsLeft) {
+      sortedDocs[i] = doc;
+      i++;
+    }
+    if (TextRulerToolkit.DEBUG) {
+      TextRulerToolkit.logIf(i != documents.size(), "ERROR, SIZE MISMATCH!");
+    }
+
+    return sortedDocs;
+  }
+
+  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder() {
+    return getSortedDocumentsInCacheOptimizedOrder(documents);
+  }
+
+  /**
+   * Builds a histogram over the token counts of all slot annotations with the given slot name.
+   * 
+   * @param slotName
+   *          name of the slot type whose annotations are measured
+   * @param filterSet
+   *          filter set that is combined with the slot name when the tokens are collected
+   * @return a list whose element at index k is the number of slot annotations that contain
+   *         exactly k tokens; its size is the largest token count found plus one
+   */
+  public List<Integer> getTokenCountHistogrammForSlotName(String slotName, Set<String> filterSet) {
+    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
+    int maxLen = 0;
+
+    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder(documents);
+
+    for (TextRulerExampleDocument doc : sortedDocs) {
+      CAS aCas = doc.getCAS();
+      List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName);
+      TypeSystem ts = aCas.getTypeSystem();
+      for (AnnotationFS a : slots) {
+
+        List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas, a
+                .getBegin(), a.getEnd(), TextRulerToolkit.getFilterSetWithSlotName(slotName,
+                filterSet), ts.getType(TextRulerToolkit.TM_ANY_TYPE_NAME));
+        int len = slotTokens.size();
+        if (len > maxLen) {
+          maxLen = len;
+        }
+        Integer key = Integer.valueOf(len);
+        int current = map.containsKey(key) ? map.get(key) : 0;
+        // one more slot with this token count (a histogram counts occurrences, it must not
+        // accumulate the length itself)
+        map.put(key, current + 1);
+      }
+    }
+    List<Integer> resultList = new ArrayList<Integer>(maxLen + 1);
+    for (int i = 0; i <= maxLen; i++) {
+      int value = map.containsKey(i) ? map.get(i) : 0;
+      resultList.add(value);
+    }
+    return resultList;
+  }
+
+  public CAS getCAS(String key) {
+    return casCache.getCAS(key);
+  }
+
+  public int size() {
+    return documents.size();
+  }
+
+  /**
+   * @return the first document whose CAS file name equals the given file name, or null if there is
+   *         none
+   */
+  public TextRulerExampleDocument getDocumentForFileName(String fileName) {
+    for (TextRulerExampleDocument doc : documents) {
+      if (doc.getCasFileName().equals(fileName)) {
+        return doc;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Splits this document set into consecutive subsets whose sizes follow the given percentages.
+   * Every partition except the last gets its rounded share of the documents (at least one); the
+   * last partition receives all documents that are left.
+   * 
+   * @param percentages
+   *          positive percentages that have to sum up to 100
+   * @return the subsets in the order of the percentages, or null if the percentages are invalid
+   *         or there are too few documents to give every partition at least one document
+   */
+  // TODO this is not tested yet!
+  public List<TextRulerExampleDocumentSet> partitionIntoSubsets(int[] percentages) {
+    List<TextRulerExampleDocumentSet> result = new ArrayList<TextRulerExampleDocumentSet>();
+
+    int sum = 0;
+    for (int p : percentages) {
+      if (p <= 0) {
+        TextRulerToolkit
+                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero!");
+        return null;
+      }
+      sum += p;
+    }
+    if (sum != 100) {
+      TextRulerToolkit
+              .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] percentages has to be 100 in total!");
+      return null;
+    }
+
+    int rest = size();
+    int docIndex = 0;
+
+    for (int i = 0; i < percentages.length; i++) {
+      int partSize;
+      if (i < percentages.length - 1) {
+        // all partitions but the last one get their rounded share, but at least one document
+        partSize = Math.round((((percentages[i] * size()) / 100.0f)));
+        if (partSize == 0) {
+          partSize = 1;
+        }
+      } else {
+        // the last partition takes whatever is left so that no document gets lost by rounding
+        partSize = rest;
+      }
+
+      // an empty partition, or one that needs more documents than are left, cannot be built
+      if (partSize <= 0 || partSize > rest) {
+        TextRulerToolkit
+                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero! too few example documents for your partition?");
+        return null;
+      }
+      String[] fileNames = new String[partSize];
+      for (int doc = 0; doc < partSize; doc++) {
+        fileNames[doc] = documents.get(doc + docIndex).getCasFileName();
+      }
+      docIndex += partSize;
+      result.add(new TextRulerExampleDocumentSet(fileNames, casCache));
+      rest -= partSize;
+    }
+    return result;
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerExampleDocumentSet.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,106 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.tm.textruler.core.TextRulerRuleItem.MLRuleItemType;
+
+
+/**
+ *
+ * TextRulerMultiSlotRule adds multi-slot specific stuff to the basic class TextRulerRule.
+ *
+ * A multi-slot-rule consists of an TextRulerSlotPattern for each slot which of each consists of
+ * three patterns: prefiller, filler an postfiller (see TextRulerSlotPattern).
+ *
+ */
+public class TextRulerMultiSlotRule extends TextRulerRule {
+
+ protected List<TextRulerSlotPattern> slotPatterns = new ArrayList<TextRulerSlotPattern>();
+
+ public TextRulerMultiSlotRule(TextRulerMultiSlotRule copyFrom) {
+ super(copyFrom);
+
+ for (TextRulerSlotPattern origP : copyFrom.slotPatterns)
+ slotPatterns.add(origP.copy());
+ }
+
+ public String getMarkName(int slotIndex) {
+ return TextRulerToolkit.getTypeShortName(target.getMultiSlotTypeName(slotIndex));
+ }
+
+ public TextRulerMultiSlotRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+ super(parentAlgorithm, target);
+ }
+
+ protected String getInterslotWildCard() {
+ return "ALL*? ";
+ }
+
+ @Override
+ public void compileRuleString() {
+ String ruleString = "";
+
+ int totalSize = 0;
+ int totalIndex = 0;
+ int interSlotWildcards = slotPatterns.size() - 1;
+ if (interSlotWildcards < 0)
+ interSlotWildcards = 0;
+ for (TextRulerSlotPattern sp : slotPatterns) {
+ totalSize += sp.preFillerPattern.size();
+ totalSize += sp.fillerPattern.size();
+ totalSize += sp.postFillerPattern.size();
+ }
+ totalSize += interSlotWildcards;
+
+ for (int slotIndex = 0; slotIndex < slotPatterns.size(); slotIndex++) {
+ TextRulerSlotPattern sPattern = slotPatterns.get(slotIndex);
+ int index = 0;
+ for (TextRulerRuleItem item : sPattern.preFillerPattern) {
+ ruleString += item.getStringForRuleString(this, MLRuleItemType.PREFILLER, index,
+ sPattern.preFillerPattern.size(), totalIndex, totalSize, slotIndex)
+ + " ";
+ index++;
+ totalIndex++;
+ }
+ index = 0;
+ for (TextRulerRuleItem item : sPattern.fillerPattern) {
+ ruleString += item.getStringForRuleString(this, MLRuleItemType.FILLER, index,
+ sPattern.fillerPattern.size(), totalIndex, totalSize, slotIndex)
+ + " ";
+ index++;
+ totalIndex++;
+ }
+
+ index = 0;
+ for (TextRulerRuleItem item : sPattern.postFillerPattern) {
+ ruleString += item.getStringForRuleString(this, MLRuleItemType.POSTFILLER, index,
+ sPattern.postFillerPattern.size(), totalIndex, totalSize, slotIndex)
+ + " ";
+ index++;
+ totalIndex++;
+ }
+
+ if (slotPatterns.size() > 1 && slotIndex < slotPatterns.size() - 1) {
+ // add interslot wildcard:
+ ruleString += getInterslotWildCard();
+ totalIndex++;
+ }
+ }
+
+ ruleString = ruleString.trim();
+ ruleString += ";";
+ this.ruleString = ruleString;
+ setNeedsCompile(false);
+ }
+
+ public List<TextRulerSlotPattern> getPatterns() {
+ return slotPatterns;
+ }
+
+ @Override
+ public TextRulerMultiSlotRule copy() {
+ return new TextRulerMultiSlotRule(this);
+ }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerMultiSlotRule.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,107 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+
+import org.apache.uima.util.FileUtils;
+
+/**
+ * 
+ * TextRulerRule is the basic class for any kind of TextMarker-Rule representation for any learning
+ * algorithm. A rule usually has a parent algorithm (that created it) and a learning target
+ * (TextRulerTarget).
+ * 
+ * The subclasses TextRulerSingleSlotRule and TextRulerMultiSlotRule add slot specific issues to it
+ * and every algorithm then has to subclass one of those two and provide a class that implements
+ * TextRulerRuleItem.
+ * 
+ */
+public abstract class TextRulerRule {
+
+  protected TextRulerBasicLearner algorithm;
+
+  // true as long as ruleString does not reflect the current state of the rule
+  protected boolean needsCompile = true;
+
+  protected String ruleString;
+
+  protected TextRulerTarget target;
+
+  protected TextRulerStatisticsCollector coveringStatistics;
+
+  // copy constructor:
+  public TextRulerRule(TextRulerRule copyFrom) {
+    algorithm = copyFrom.algorithm;
+    needsCompile = copyFrom.needsCompile;
+    ruleString = copyFrom.ruleString;
+    target = new TextRulerTarget(copyFrom.target, algorithm);
+    coveringStatistics = copyFrom.coveringStatistics != null ? copyFrom.coveringStatistics.copy()
+            : null;
+  }
+
+  public TextRulerRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+    super();
+    algorithm = parentAlgorithm;
+    this.target = target;
+  }
+
+  /**
+   * @return the rule string; it is (re)compiled first if the rule is flagged as needing
+   *         compilation
+   */
+  public String getRuleString() {
+    if (needsCompile)
+      compileRuleString();
+    return ruleString;
+  }
+
+  public TextRulerTarget getTarget() {
+    return target;
+  }
+
+  public abstract void compileRuleString();
+
+  public void setNeedsCompile(boolean flag) {
+    needsCompile = flag;
+  }
+
+  // file header of the parent algorithm followed by the rule string and a line break
+  protected String getRulesFileContent() {
+    return algorithm.getTMFileHeaderString() + getRuleString() + "\n";
+  }
+
+  /**
+   * Writes the rules file content to the given file. A failure is not propagated to the caller;
+   * only the stack trace is printed.
+   */
+  public void saveToRulesFile(String filename) {
+    File file = new File(filename);
+    String str = getRulesFileContent();
+    try {
+      FileUtils.saveString2File(str, file);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Two rules are equal if their rule strings are equal. Null and objects that are no
+   * TextRulerRule are never equal to a rule.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    // instanceof is also false for null, so neither null nor foreign types reach the cast
+    if (!(obj instanceof TextRulerRule)) {
+      return false;
+    }
+    return this.getRuleString().equals(((TextRulerRule) obj).getRuleString());
+  }
+
+  @Override
+  public int hashCode() {
+    return this.getRuleString().hashCode();
+  }
+
+  /**
+   * Stores a copy of the given statistics; passing null clears the statistics of this rule.
+   */
+  public void setCoveringStatistics(TextRulerStatisticsCollector c) {
+    // null is a legal state of the field (see the copy constructor), so accept it here as well
+    coveringStatistics = c != null ? c.copy() : null;
+  }
+
+  public TextRulerStatisticsCollector getCoveringStatistics() {
+    return coveringStatistics;
+  }
+
+  public abstract TextRulerRule copy();
+
+  @Override
+  public String toString() {
+    // deliberately does not trigger a compile, even if one is necessary: just return the
+    // current rule string!
+    return ruleString == null ? "<not compiled>" : ruleString;
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRule.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,27 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ * 
+ * Every rule pattern (TextRulerRulePattern) consists of items. Those have to implement the
+ * interface TextRulerRuleItem.
+ * 
+ * hint: maybe we should change this to an abstract class instead of an interface ?!
+ */
+public interface TextRulerRuleItem {
+
+  /** The part of a slot pattern an item belongs to. */
+  enum MLRuleItemType {
+    PREFILLER, FILLER, POSTFILLER
+  }
+
+  /**
+   * Returns the text this item contributes to the rule string of the given rule. Besides the
+   * rule and the pattern type, the item is told its position and the size of both its pattern
+   * and the whole rule, as well as the index of the slot it belongs to.
+   */
+  String getStringForRuleString(TextRulerRule rule, MLRuleItemType type, int numberInPattern,
+          int patternSize, int numberInRule, int ruleSize, int slotIndex);
+
+  // note: this takes a TextRulerRuleItem, so it is an overload of Object.equals(Object)
+  boolean equals(TextRulerRuleItem o);
+
+  TextRulerRuleItem copy();
+
+  String toString();
+
+  int hashCode();
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleItem.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,58 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import org.apache.uima.util.FileUtils;
+
+/**
+ * 
+ * TextRulerRuleList can hold a list of rules and provides some extra functionality like saving them
+ * to a TextMarker rule file...
+ * 
+ */
+public class TextRulerRuleList extends ArrayList<TextRulerRule> {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Writes the header followed by all rules to the given file. A failure is not propagated to the
+   * caller; only the stack trace is printed.
+   */
+  public void saveToRulesFile(String filename, String fileHeader) {
+    File target = new File(filename);
+    try {
+      FileUtils.saveString2File(getTMFileString(fileHeader), target);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Appends the rule unless the list already contains an equal rule.
+   * 
+   * @return true if the rule was added, false if it was already contained
+   */
+  public boolean addRule(TextRulerRule rule) {
+    if (this.contains(rule)) {
+      return false;
+    }
+    this.add(rule);
+    return true;
+  }
+
+  public String getRulesString(String linePrefix) {
+    return getRulesString(linePrefix, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Renders one line per rule: the line prefix, the rule string and the covering statistics of
+   * the rule as trailing comment. Rule strings that are longer than maxRuleStringLength are
+   * replaced by a placeholder.
+   */
+  public String getRulesString(String linePrefix, int maxRuleStringLength) {
+    StringBuilder lines = new StringBuilder();
+    for (TextRulerRule rule : this) {
+      String compiled = rule.getRuleString();
+      String shown;
+      if (compiled.length() > maxRuleStringLength) {
+        shown = "<too long to display>";
+      } else {
+        shown = compiled;
+      }
+      lines.append(linePrefix).append(shown).append("\t// ")
+              .append(rule.getCoveringStatistics()).append("\n");
+    }
+    return lines.toString();
+  }
+
+  public String getTMFileString(String header) {
+    String rules = getRulesString("", Integer.MAX_VALUE);
+    return header + rules;
+  }
+
+  public String getTMFileString(String header, int maxRuleStringLength) {
+    String rules = getRulesString("", maxRuleStringLength);
+    return header + rules;
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRuleList.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,80 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.ArrayList;
+
+/**
+ * 
+ * TextRulerRulePattern is an ordered list of rule items and provides some special functionality for
+ * dealing with rule patterns like finding sub patterns or such.
+ * 
+ * hint: this is a very basic implementation and could surely be optimized ;-)
+ */
+public class TextRulerRulePattern extends ArrayList<TextRulerRuleItem> {
+
+  private static final long serialVersionUID = 1L;
+
+  /** @return the string forms of all items, separated by single blanks */
+  @Override
+  public String toString() {
+    StringBuilder joined = new StringBuilder();
+    for (TextRulerRuleItem item : this) {
+      joined.append(" ").append(item);
+    }
+    return joined.toString().trim();
+  }
+
+  /**
+   * Looks for the first occurrence of the given sub pattern. Items are compared by their string
+   * form.
+   * 
+   * @return the index at which the sub pattern starts, or -1 if it is empty, longer than this
+   *         pattern or does not occur
+   */
+  public int find(TextRulerRulePattern subPattern) {
+    if (subPattern.size() == 0 || size() < subPattern.size()) {
+      return -1;
+    }
+    int lastStart = size() - subPattern.size();
+    for (int start = 0; start <= lastStart; start++) {
+      if (matchesAt(start, subPattern)) {
+        return start;
+      }
+    }
+    return -1;
+  }
+
+  // true if, beginning at start, every item has the same string form as the corresponding item
+  // of the sub pattern
+  private boolean matchesAt(int start, TextRulerRulePattern subPattern) {
+    for (int offset = 0; offset < subPattern.size(); offset++) {
+      String own = get(start + offset).toString();
+      if (!own.equals(subPattern.get(offset).toString())) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Returns a new pattern holding up to length items of this pattern, beginning at start. The
+   * items themselves are not copied. A negative length means "up to the end of this pattern".
+   */
+  public TextRulerRulePattern subPattern(int start, int length) {
+    TextRulerRulePattern part = new TextRulerRulePattern();
+    int wanted = length < 0 ? size() : length;
+    for (int offset = 0; offset < wanted && start + offset < size(); offset++) {
+      part.add(get(start + offset));
+    }
+    return part;
+  }
+
+  /** @return a new pattern that holds a copy of every item of this pattern */
+  public TextRulerRulePattern copy() {
+    TextRulerRulePattern duplicate = new TextRulerRulePattern();
+    for (TextRulerRuleItem item : this) {
+      duplicate.add(item.copy());
+    }
+    return duplicate;
+  }
+
+  /** @return the last item, or null if this pattern is empty */
+  public TextRulerRuleItem lastItem() {
+    return size() > 0 ? get(size() - 1) : null;
+  }
+
+  /** @return the first item, or null if this pattern is empty */
+  public TextRulerRuleItem firstItem() {
+    return size() > 0 ? get(0) : null;
+  }
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerRulePattern.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,41 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ * 
+ * This class introduces the special additional information of an example for learning
+ * LP2-Correction Rules. Besides the original, correct slot boundary annotation it needs the wrong
+ * annotation, or the other way round, it needs the correct annotation where to shift the boundary
+ * tag.
+ * 
+ * Since TextRulerExample provides a possibility to store more than one Annotation for
+ * MultiSlot-Examples, we easily can use this storage for those two annotations. But for the sake of
+ * code reading and better understanding, this additional class with named accessors is created.
+ * 
+ */
+public class TextRulerShiftExample extends TextRulerExample {
+
+  public TextRulerShiftExample(TextRulerExampleDocument document,
+          TextRulerAnnotation wrongAnnotation, TextRulerAnnotation correctAnnotation,
+          boolean isPositive, TextRulerTarget target) {
+    super(document, (TextRulerAnnotation[]) null, isPositive, target);
+    // slot 0 holds the wrong annotation, slot 1 the correct one
+    annotations = new TextRulerAnnotation[] { wrongAnnotation, correctAnnotation };
+  }
+
+  public TextRulerAnnotation wrongAnnotation() {
+    return annotations[0];
+  }
+
+  public TextRulerAnnotation correctAnnotation() {
+    return annotations[1];
+  }
+
+  /** @return the begin of the wrong annotation and the begin of the correct one, as "a --> b" */
+  @Override
+  public String toString() {
+    return wrongAnnotation().getBegin() + " --> " + correctAnnotation().getBegin();
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerShiftExample.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,164 @@
+package org.apache.uima.tm.textruler.core;
+
+import org.apache.uima.tm.textruler.core.TextRulerRuleItem.MLRuleItemType;
+import org.apache.uima.tm.textruler.core.TextRulerTarget.MLTargetType;
+
+/**
+ * 
+ * TextRulerSingleSlotRule adds single-slot specific stuff to the basic class TextRulerRule.
+ * 
+ * A single-slot-rule consists of one TextRulerSlotPattern which consists of three patterns:
+ * prefiller, filler and postfiller (see TextRulerSlotPattern).
+ * 
+ */
+public class TextRulerSingleSlotRule extends TextRulerRule {
+
+  protected TextRulerSlotPattern slotPattern = new TextRulerSlotPattern();
+
+  public TextRulerSingleSlotRule(TextRulerBasicLearner parentAlgorithm, TextRulerTarget target) {
+    super(parentAlgorithm, target);
+  }
+
+  // copy constructor: the slot pattern of the original is copied
+  public TextRulerSingleSlotRule(TextRulerSingleSlotRule copyFrom) {
+    super(copyFrom);
+    slotPattern = copyFrom.slotPattern.copy();
+  }
+
+  public TextRulerRulePattern getPreFillerPattern() {
+    return slotPattern.preFillerPattern;
+  }
+
+  public TextRulerRulePattern getFillerPattern() {
+    return slotPattern.fillerPattern;
+  }
+
+  public TextRulerRulePattern getPostFillerPattern() {
+    return slotPattern.postFillerPattern;
+  }
+
+  public String getMarkName() {
+    return TextRulerToolkit.getTypeShortName(target.getSingleSlotTypeName());
+  }
+
+  /**
+   * Concatenates the rule strings of all prefiller, filler and postfiller items (each followed by
+   * a blank), terminates the trimmed result with ";" and stores it as the current rule string.
+   */
+  @Override
+  public void compileRuleString() {
+    int preCount = slotPattern.preFillerPattern.size();
+    int postCount = slotPattern.postFillerPattern.size();
+    int fillerCount = slotPattern.fillerPattern.size();
+    int totalSize = preCount + postCount + fillerCount;
+
+    StringBuilder compiled = new StringBuilder();
+    // index is the position inside the current pattern, totalIndex the position inside the rule
+    int index = 0;
+    int totalIndex = 0;
+    for (TextRulerRuleItem item : slotPattern.preFillerPattern) {
+      compiled.append(item.getStringForRuleString(this, MLRuleItemType.PREFILLER, index,
+              preCount, totalIndex, totalSize, 0));
+      compiled.append(" ");
+      index++;
+      totalIndex++;
+    }
+
+    index = 0;
+    for (TextRulerRuleItem item : slotPattern.fillerPattern) {
+      compiled.append(item.getStringForRuleString(this, MLRuleItemType.FILLER, index,
+              fillerCount, totalIndex, totalSize, 0));
+      compiled.append(" ");
+      index++;
+      totalIndex++;
+    }
+
+    index = 0;
+    for (TextRulerRuleItem item : slotPattern.postFillerPattern) {
+      compiled.append(item.getStringForRuleString(this, MLRuleItemType.POSTFILLER, index,
+              postCount, totalIndex, totalSize, 0));
+      compiled.append(" ");
+      index++;
+      totalIndex++;
+    }
+
+    this.ruleString = compiled.toString().trim() + ";";
+    setNeedsCompile(false);
+  }
+
+  // prefiller items grow away from the filler, so a new item becomes the first (outermost) one
+  public void addPreFillerItem(TextRulerRuleItem item) {
+    slotPattern.preFillerPattern.add(0, item);
+    setNeedsCompile(true);
+  }
+
+  // appends the item at the end of the prefiller pattern instead of inserting it at the front
+  public void addPreFillerItemWithNormalOrder(TextRulerRuleItem item) {
+    slotPattern.preFillerPattern.add(item);
+    setNeedsCompile(true);
+  }
+
+  public void addPostFillerItem(TextRulerRuleItem item) {
+    slotPattern.postFillerPattern.add(item);
+    setNeedsCompile(true);
+  }
+
+  /**
+   * Appends an item to the filler pattern. This is only possible for rules whose target type is
+   * SINGLE_WHOLE_SLOT; for any other target type the item is not added and the misuse is logged.
+   */
+  public void addFillerItem(TextRulerRuleItem item) {
+    if (target.type == MLTargetType.SINGLE_WHOLE_SLOT) {
+      slotPattern.fillerPattern.add(item);
+      setNeedsCompile(true);
+    } else {
+      // the rule is left unchanged as before, but the misuse is reported instead of creating an
+      // exception object that nobody ever sees
+      TextRulerToolkit.log("[TextRulerRule] BOUNDARY SLOT RULES CANNOT HAVE FILLER ITEMS!");
+    }
+  }
+
+  /** @return the first prefiller item, or null if there is no prefiller item */
+  public TextRulerRuleItem getOutermostPreFillerItem() {
+    if (slotPattern.preFillerPattern.size() == 0)
+      return null;
+    else
+      return slotPattern.preFillerPattern.get(0);
+  }
+
+  /** @return the last postfiller item, or null if there is no postfiller item */
+  public TextRulerRuleItem getOutermostPostFillerItem() {
+    if (slotPattern.postFillerPattern.size() == 0)
+      return null;
+    else
+      return slotPattern.postFillerPattern.get(slotPattern.postFillerPattern.size() - 1);
+  }
+
+  public void removeOutermostPreFillerItem() {
+    if (slotPattern.preFillerPattern.size() > 0) {
+      slotPattern.preFillerPattern.remove(0);
+      setNeedsCompile(true);
+    }
+  }
+
+  public void removeOutermostPostFillerItem() {
+    if (slotPattern.postFillerPattern.size() > 0) {
+      slotPattern.postFillerPattern.remove(slotPattern.postFillerPattern.size() - 1);
+      setNeedsCompile(true);
+    }
+  }
+
+  /**
+   * Addresses the items of the rule as one sequence: prefiller items first, then filler items,
+   * then postfiller items.
+   * 
+   * @return the item at the given position, or null if the index lies behind the last item
+   */
+  public TextRulerRuleItem getRuleItemWithIndex(int index) {
+    int i = index;
+    if (i < slotPattern.preFillerPattern.size())
+      return slotPattern.preFillerPattern.get(i);
+    else
+      i -= slotPattern.preFillerPattern.size();
+
+    if (i < slotPattern.fillerPattern.size())
+      return slotPattern.fillerPattern.get(i);
+    else
+      i -= slotPattern.fillerPattern.size();
+
+    if (i < slotPattern.postFillerPattern.size())
+      return slotPattern.postFillerPattern.get(i);
+    else
+      return null;
+  }
+
+  public int totalItemCount() {
+    return slotPattern.preFillerPattern.size() + slotPattern.fillerPattern.size()
+            + slotPattern.postFillerPattern.size();
+  }
+
+  @Override
+  public TextRulerSingleSlotRule copy() {
+    return new TextRulerSingleSlotRule(this);
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSingleSlotRule.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,35 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ * Container for the three item sequences a TextMarker rule usually consists of: the pattern in
+ * front of the slot filler (pre-filler), the pattern for the filler itself, and the pattern
+ * behind the filler (post-filler).
+ */
+public class TextRulerSlotPattern {
+
+  /** Pattern in front of the filler. */
+  public TextRulerRulePattern preFillerPattern = new TextRulerRulePattern();
+
+  /** Pattern of the filler itself. */
+  public TextRulerRulePattern fillerPattern = new TextRulerRulePattern();
+
+  /** Pattern behind the filler. */
+  public TextRulerRulePattern postFillerPattern = new TextRulerRulePattern();
+
+  /** Creates a slot pattern with three freshly constructed patterns. */
+  public TextRulerSlotPattern() {
+  }
+
+  /**
+   * Copy constructor. Every item of the three source patterns is duplicated through its own
+   * {@code copy()} method and appended to the matching pattern of the new instance, in the order
+   * pre-filler, filler, post-filler.
+   *
+   * @param copyFrom
+   *          the slot pattern whose items are duplicated
+   */
+  public TextRulerSlotPattern(TextRulerSlotPattern copyFrom) {
+    appendCopies(copyFrom.preFillerPattern, preFillerPattern);
+    appendCopies(copyFrom.fillerPattern, fillerPattern);
+    appendCopies(copyFrom.postFillerPattern, postFillerPattern);
+  }
+
+  /**
+   * @return a new slot pattern built with the copy constructor from this instance
+   */
+  public TextRulerSlotPattern copy() {
+    final TextRulerSlotPattern duplicate = new TextRulerSlotPattern(this);
+    return duplicate;
+  }
+
+  /** Appends a copy of each item in {@code source} to {@code destination}, keeping the order. */
+  private static void appendCopies(TextRulerRulePattern source, TextRulerRulePattern destination) {
+    for (TextRulerRuleItem item : source) {
+      destination.add(item.copy());
+    }
+  }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerSlotPattern.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,126 @@
+package org.apache.uima.tm.textruler.core;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Accumulates coverage statistics, e.g. while a rule is being tested.
+ *
+ * Two hash sets record the covered positive and covered negative examples (for instance of the
+ * training documents), and two counters hold the number of true positives ({@code p}) and false
+ * positives ({@code n}). The counters can also be changed without touching the sets (see
+ * {@link #incCoveredPositives(int)} and {@link #incCoveredNegatives(int)}), so counter and set
+ * size are not guaranteed to agree; {@link #reflectCountsFromCoveredExamples()} re-aligns them.
+ *
+ * False negatives (missed positives) are currently not counted (see
+ * TextRulerExampleDocument.compareOriginalDocumentWithTestCAS). The code for it is kept in this
+ * class in commented-out form so that the functionality can be added back easily.
+ */
+public class TextRulerStatisticsCollector {
+
+  /** Number of covered positive examples (true positives). */
+  protected int p = 0;
+
+  /** Number of covered negative examples (false positives). */
+  protected int n = 0;
+
+  // protected int missedPositives = 0; // (false negatives)
+
+  /** Positive examples recorded as covered. */
+  protected Set<TextRulerExample> coveredPositives = new HashSet<TextRulerExample>();
+
+  /** Negative examples recorded as covered. */
+  protected Set<TextRulerExample> coveredNegatives = new HashSet<TextRulerExample>();
+
+  /** Creates a collector with both counters at zero and both example sets empty. */
+  public TextRulerStatisticsCollector() {
+  }
+
+  /**
+   * Copy constructor: takes over the counters of {@code c} and adds all of its covered examples
+   * to the new collector's own sets. The example objects themselves are shared, not cloned.
+   *
+   * @param c
+   *          the collector to copy from
+   */
+  public TextRulerStatisticsCollector(TextRulerStatisticsCollector c) {
+    this.p = c.p;
+    this.n = c.n;
+    this.coveredPositives.addAll(c.coveredPositives);
+    this.coveredNegatives.addAll(c.coveredNegatives);
+  }
+
+  /**
+   * @return the sum of the positive and the negative counter
+   */
+  public int getTotalCoveredExamples() {
+    return p + n;
+  }
+
+  /**
+   * Returns the positive counter. In debug mode a message is logged when the counter differs
+   * from the size of the covered-positives set.
+   *
+   * @return the number of covered positive examples
+   */
+  public int getCoveredPositivesCount() {
+    if (TextRulerToolkit.DEBUG) {
+      if (p != coveredPositives.size()) {
+        TextRulerToolkit.log("WHY is P different from coveredPositives.size() ??");
+      }
+    }
+    return p;
+  }
+
+  /**
+   * @return the number of covered negative examples
+   */
+  public int getCoveredNegativesCount() {
+    return n;
+  }
+
+  /**
+   * @return the internal set of covered positive examples (not a copy)
+   */
+  public Set<TextRulerExample> getCoveredPositiveExamples() {
+    return coveredPositives;
+  }
+
+  /**
+   * @return the internal set of covered negative examples (not a copy)
+   */
+  public Set<TextRulerExample> getCoveredNegativeExamples() {
+    return coveredNegatives;
+  }
+
+  // public int getMissedPositives()
+  // {
+  // return missedPositives;
+  // }
+
+  /** Overwrites both counters with the current sizes of the corresponding example sets. */
+  public void reflectCountsFromCoveredExamples() {
+    n = coveredNegatives.size();
+    p = coveredPositives.size();
+  }
+
+  /** Empties both example sets and sets both counters back to zero. */
+  public void reset() {
+    coveredPositives.clear();
+    coveredNegatives.clear();
+    // missedPositives = 0 ;
+    // coveredDocuments = 0;
+    p = 0;
+    n = 0;
+  }
+
+  /**
+   * Raises the positive counter only; the example set is left untouched.
+   *
+   * @param count
+   *          amount to add to the positive counter
+   */
+  public void incCoveredPositives(int count) {
+    p = p + count;
+  }
+
+  /**
+   * Raises the negative counter only; the example set is left untouched.
+   *
+   * @param count
+   *          amount to add to the negative counter
+   */
+  public void incCoveredNegatives(int count) {
+    n = n + count;
+  }
+
+  /**
+   * Records a covered positive example. The positive counter is incremented by one only if the
+   * example was not in the set yet; otherwise a debug message is logged.
+   *
+   * @param e
+   *          the covered positive example
+   */
+  public void addCoveredPositive(TextRulerExample e) {
+    final boolean newlyAdded = coveredPositives.add(e);
+    if (!newlyAdded) {
+      TextRulerToolkit.logIfDebug("TRIED TO ADD A POSITIVE COVERED EXAMPLE TWICE !!");
+      return;
+    }
+    incCoveredPositives(1);
+  }
+
+  /**
+   * Records a covered negative example. The negative counter is incremented by one only if the
+   * example was not in the set yet; otherwise a debug message is logged.
+   *
+   * @param e
+   *          the covered negative example
+   */
+  public void addCoveredNegative(TextRulerExample e) {
+    final boolean newlyAdded = coveredNegatives.add(e);
+    if (!newlyAdded) {
+      TextRulerToolkit.logIfDebug("TRIED TO ADD A NEGATIVE COVERED EXAMPLE TWICE !!");
+      return;
+    }
+    incCoveredNegatives(1);
+  }
+
+  /**
+   * @return a new collector created with the copy constructor from this instance
+   */
+  public TextRulerStatisticsCollector copy() {
+    final TextRulerStatisticsCollector duplicate = new TextRulerStatisticsCollector(this);
+    return duplicate;
+  }
+
+  /**
+   * Merges another collector into this one: its counters are added to this collector's counters
+   * and its example sets are united with this collector's sets. Examples present in both
+   * collectors are counted twice by the counters but stored once in the sets.
+   *
+   * @param c
+   *          the collector whose statistics are added
+   */
+  public void add(TextRulerStatisticsCollector c) {
+    incCoveredNegatives(c.n);
+    incCoveredPositives(c.p);
+    this.coveredPositives.addAll(c.coveredPositives);
+    this.coveredNegatives.addAll(c.coveredNegatives);
+  }
+
+  /**
+   * @return the two counters in the form {@code p=<p>; n=<n>}
+   */
+  @Override
+  public String toString() {
+    final StringBuilder text = new StringBuilder();
+    text.append("p=").append(p);
+    text.append("; n=").append(n);
+    return text.toString();
+  }
+
+  // public void incCoveredMissedPositives(int count)
+  // {
+  // missedPositives += count;
+  // }
+
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerStatisticsCollector.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java?rev=1152792&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java (added)
+++ uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java Mon Aug 1 14:21:12 2011
@@ -0,0 +1,160 @@
+package org.apache.uima.tm.textruler.core;
+
+/**
+ *
+ * TextRulerTarget encapsulates a learning target of an ML-algorithm. It currently can be
+ *
+ * * multi slot (full slots, not boundary!) * single slot (full slot) * single boundary (single slot
+ * left or right boundary)
+ *
+ * An TextRulerRule for example is induced for an TextRulerTarget or a testCAS for a given
+ * TextRulerExampleDocument is created especially for a given target.
+ *
+ */
+public class TextRulerTarget {
+
+ public enum MLTargetType {
+ MULTI_SLOT, SINGLE_WHOLE_SLOT, SINGLE_LEFT_BOUNDARY, SINGLE_RIGHT_BOUNDARY, SINGLE_LEFT_CORRECTION, SINGLE_RIGHT_CORRECTION
+ };
+
+ public String slotNames[];
+
+ public MLTargetType type;
+
+ private TextRulerBasicLearner learner;
+
+ private int maxShiftDistance = 0;
+
+ // copy constructor
+ public TextRulerTarget(TextRulerTarget copyFrom, TextRulerBasicLearner owner) {
+ this.slotNames = copyFrom.slotNames.clone();
+ this.type = copyFrom.type;
+ this.learner = owner;
+ }
+
+ public TextRulerTarget(String slotNames[], TextRulerBasicLearner owner) {
+ this.slotNames = slotNames;
+ type = MLTargetType.MULTI_SLOT;
+ this.learner = owner;
+ }
+
+ public TextRulerTarget(String slotName, TextRulerBasicLearner owner) {
+ slotNames = new String[1];
+ slotNames[0] = slotName;
+ type = MLTargetType.SINGLE_WHOLE_SLOT;
+ this.learner = owner;
+ }
+
+ public TextRulerBasicLearner getLearner() {
+ return learner;
+ }
+
+ public TextRulerTarget(String slotName, MLTargetType type, TextRulerBasicLearner owner) {
+ slotNames = new String[1];
+ slotNames[0] = slotName;
+ this.type = type;
+ this.learner = owner;
+ }
+
+ public String getMultiSlotTypeName(int slotIndex) {
+ return slotNames[slotIndex];
+ }
+
+ public String getSingleSlotRawTypeName() {
+ return slotNames[0];
+ }
+
+ public String getSingleSlotTypeName() {
+ return getSingleSlotTypeName(type, slotNames[0]);
+ // if (type == MLTargetType.MULTI_SLOT)
+ // return null;
+ // if (type == MLTargetType.SINGLE_LEFT_BOUNDARY)
+ // return slotNames[0]+TextRulerToolkit.LEFT_BOUNDARY_EXTENSION;
+ // else if (type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
+ // return slotNames[0]+TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION;
+ // else
+ // return slotNames[0];
+ }
+
+ public static String getSingleSlotTypeName(MLTargetType t, String slotName) {
+ if (t == MLTargetType.MULTI_SLOT)
+ return null;
+ if (t == MLTargetType.SINGLE_LEFT_BOUNDARY)
+ return slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION;
+ else if (t == MLTargetType.SINGLE_RIGHT_BOUNDARY)
+ return slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION;
+ else
+ return slotName;
+ }
+
+ public String[] getRawSlotNames() {
+ return slotNames;
+ }
+
+ public String[] getSlotTypeNames() {
+ if (type == MLTargetType.MULTI_SLOT)
+ return slotNames;
+ else {
+ String result[] = new String[1];
+ result[0] = getSingleSlotTypeName();
+ return result;
+ }
+ }
+
+ public boolean isMultiSlot() {
+ return type == MLTargetType.MULTI_SLOT;
+ }
+
+ public boolean isBoundary() {
+ return type == MLTargetType.SINGLE_LEFT_BOUNDARY || type == MLTargetType.SINGLE_RIGHT_BOUNDARY;
+ }
+
+ public boolean isLeftBoundary() {
+ return type == MLTargetType.SINGLE_LEFT_BOUNDARY;
+ }
+
+ public boolean isRightBoundary() {
+ return type == MLTargetType.SINGLE_RIGHT_BOUNDARY;
+ }
+
+ public boolean isLeftCorrection() {
+ return type == MLTargetType.SINGLE_LEFT_CORRECTION;
+ }
+
+ public boolean isRightCorrection() {
+ return type == MLTargetType.SINGLE_RIGHT_CORRECTION;
+ }
+
+ public TextRulerTarget getCounterPartBoundaryTarget() {
+ if (!isBoundary())
+ return null;
+ else if (type == MLTargetType.SINGLE_LEFT_BOUNDARY)
+ return new TextRulerTarget(slotNames[0], MLTargetType.SINGLE_RIGHT_BOUNDARY, learner);
+ else
+ return new TextRulerTarget(slotNames[0], MLTargetType.SINGLE_LEFT_BOUNDARY, learner);
+ }
+
+ public int getMaxShiftDistance() {
+ return maxShiftDistance;
+ }
+
+ public void setMaxShiftDistance(int maxShiftDistance) {
+ this.maxShiftDistance = maxShiftDistance;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ TextRulerTarget t = (TextRulerTarget) o;
+ if (t.slotNames.length != slotNames.length)
+ return false;
+ for (int i = 0; i < slotNames.length; i++)
+ if (!slotNames[i].equals(t.slotNames[i]))
+ return false;
+ return type == t.type;
+ }
+
+ @Override
+ public int hashCode() {
+ return slotNames.hashCode() * type.hashCode();
+ }
+}
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: uima/sandbox/trunk/TextMarker/org.apache.uima.tm.textruler/src/main/java/org/apache/uima/tm/textruler/core/TextRulerTarget.java
------------------------------------------------------------------------------
svn:mime-type = text/plain