You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2013/06/06 16:49:05 UTC
svn commit: r1490307 - in
/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler:
core/ extension/ learner/lp2/ tools/
Author: pkluegl
Date: Thu Jun 6 14:49:04 2013
New Revision: 1490307
URL: http://svn.apache.org/r1490307
Log:
UIMA-2344
- support boundaries by expanding type systems
Removed:
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/BatchRuleEvaluator.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/BatchRuleScorer.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/F1Scorer.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/SGMLToXMIConverter.java
Modified:
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java
uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java Thu Jun 6 14:49:04 2013
@@ -74,7 +74,7 @@ public class TextRulerAnnotation {
}
public TextRulerAnnotation(AnnotationFS afs, TextRulerExampleDocument document) {
- this(afs, null, null);
+ this(afs, document, null);
}
public TextRulerAnnotation(AnnotationFS afs) {
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java Thu Jun 6 14:49:04 2013
@@ -30,10 +30,17 @@ import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASRuntimeException;
+import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.TypeDescription;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.resource.metadata.impl.TypeDescription_impl;
import org.apache.uima.ruta.engine.RutaEngine;
import org.apache.uima.ruta.ide.core.builder.RutaProjectUtils;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
@@ -86,6 +93,8 @@ public abstract class TextRulerBasicLear
private boolean configChanged = false;
+ protected boolean supportBoundaries = false;
+
public TextRulerBasicLearner(String inputDir, String prePropTMFile, String tmpDir,
String[] slotNames, Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
super();
@@ -112,9 +121,9 @@ public abstract class TextRulerBasicLear
useDefaultFiltering &= filterSet.contains("org.apache.uima.ruta.type.BREAK");
useDefaultFiltering &= filterSet.contains("org.apache.uima.ruta.type.NBSP");
useDefaultFiltering &= filterSet.contains("org.apache.uima.ruta.type.MARKUP");
-
+
configChanged = true;
-
+
this.casCache = new CasCache(100, this); // TODO make size configurable
// !? share e.g. 100 places for
// all running algorithms ?
@@ -142,7 +151,13 @@ public abstract class TextRulerBasicLear
String descriptorFile = TextRulerToolkit.getEngineDescriptorFromTMSourceFile(new Path(
preprocessorTMFile));
sendStatusUpdateToDelegate("loading AE...", TextRulerLearnerState.ML_INITIALIZING, false);
- ae = TextRulerToolkit.loadAnalysisEngine(descriptorFile);
+
+ AnalysisEngineDescription description = TextRulerToolkit.getAnalysisEngineDescription(descriptorFile);
+ if(supportBoundaries) {
+ TextRulerToolkit.addBoundaryTypes(description, slotNames);
+ }
+
+ ae = TextRulerToolkit.loadAnalysisEngine(description);
// set filters to NO filtering so that we can add it manually with
// the FILTERTYPE expression!
@@ -154,17 +169,19 @@ public abstract class TextRulerBasicLear
ae.setConfigParameterValue(RutaEngine.ADDITIONAL_SCRIPTS, new String[0]);
ae.setConfigParameterValue(RutaEngine.RELOAD_SCRIPT, true);
ae.setConfigParameterValue(RutaEngine.REMOVE_BASICS, true);
- if(useDynamicAnchoring) {
+ if (useDynamicAnchoring) {
ae.setConfigParameterValue(RutaEngine.DYNAMIC_ANCHORING, true);
}
-
try {
ae.reconfigure();
} catch (ResourceConfigurationException e) {
TextRulerPlugin.error(e);
}
+ configChanged = true;
}
+
+
protected boolean checkForMandatoryTypes() {
// check if all passed slot types are present:
CAS someCas = getTestCAS();
@@ -442,11 +459,36 @@ public abstract class TextRulerBasicLear
}
public String getFileHeaderString(boolean complete) {
- return getPackageString() + getTypeSystemImport(complete) + getFilterCommandString() + getUseDynamicAnchoring(complete);
+ return getPackageString() + getTypeSystemImport(complete) + getFilterCommandString()
+ + getUseDynamicAnchoring(complete) + getBoundaryDeclarations(complete);
+ }
+
+ private String getBoundaryDeclarations(boolean complete) {
+ if (complete && supportBoundaries && slotNames.length > 0) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("DECLARE ");
+ int count = 0;
+ for (String slot : slotNames) {
+ String[] split = slot.split("[.]");
+ String shortName = split[split.length - 1];
+ sb.append(shortName);
+ sb.append("START");
+ sb.append(", ");
+ sb.append(shortName);
+ sb.append("END");
+ if (count < slotNames.length - 1) {
+ sb.append(", ");
+ }
+ count++;
+ }
+ sb.append(";\n");
+ return sb.toString();
+ }
+ return "";
}
private String getUseDynamicAnchoring(boolean complete) {
- if(useDynamicAnchoring && complete) {
+ if (useDynamicAnchoring && complete) {
return "Document{-> DYNAMICANCHORING(true)};\n";
} else {
return "";
@@ -454,20 +496,21 @@ public abstract class TextRulerBasicLear
}
private String getTypeSystemImport(boolean complete) {
- if(complete) {
- IPath path = Path.fromOSString(preprocessorTMFile);
- IPath removeLastSegments = path.removeLastSegments(1);
- IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot().getContainerForLocation(removeLastSegments);
- IProject project = containerForLocation.getProject();
- IPath scriptRootPath = RutaProjectUtils.getScriptRootPath(project);
- String moduleName = RutaProjectUtils.getModuleName(path);
- IPath makeRelativeTo = path.makeRelativeTo(scriptRootPath);
- String m = makeRelativeTo.removeFileExtension().toPortableString().replaceAll("/", ".");
- String importString = "SCRIPT " + m + ";\n";
- if (!skip) {
+ if (complete) {
+ IPath path = Path.fromOSString(preprocessorTMFile);
+ IPath removeLastSegments = path.removeLastSegments(1);
+ IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot()
+ .getContainerForLocation(removeLastSegments);
+ IProject project = containerForLocation.getProject();
+ IPath scriptRootPath = RutaProjectUtils.getScriptRootPath(project);
+ String moduleName = RutaProjectUtils.getModuleName(path);
+ IPath makeRelativeTo = path.makeRelativeTo(scriptRootPath);
+ String m = makeRelativeTo.removeFileExtension().toPortableString().replaceAll("/", ".");
+ String importString = "SCRIPT " + m + ";\n";
+ if (!skip) {
importString += "Document{-> CALL(" + moduleName + ")};\n";
- }
- return importString;
+ }
+ return importString;
}
return "";
}
@@ -475,8 +518,10 @@ public abstract class TextRulerBasicLear
public String getPackageString() {
IPath path = Path.fromOSString(preprocessorTMFile);
IPath removeLastSegments = path.removeLastSegments(1);
- IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot().getContainerForLocation(removeLastSegments);
- IPath removeFirstSegments = containerForLocation.getProjectRelativePath().removeFirstSegments(1);
+ IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot()
+ .getContainerForLocation(removeLastSegments);
+ IPath removeFirstSegments = containerForLocation.getProjectRelativePath()
+ .removeFirstSegments(1);
String replaceAll = removeFirstSegments.toPortableString().replaceAll("/", ".");
return "PACKAGE " + replaceAll + ";\n\n";
}
@@ -509,8 +554,8 @@ public abstract class TextRulerBasicLear
// them works without leaking, so we prefer this now since it also
// brought a performance
// boost!
-
- if(configChanged && algTestCAS != null) { // type system maybe changed
+
+ if (configChanged && algTestCAS != null) { // type system maybe changed
GlobalCASSource.releaseCAS(algTestCAS);
algTestCAS = null;
}
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java Thu Jun 6 14:49:04 2013
@@ -37,6 +37,7 @@ import java.util.Set;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
@@ -45,7 +46,8 @@ import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.resource.metadata.TypeDescription;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.ruta.engine.RutaEngine;
import org.apache.uima.ruta.ide.core.builder.RutaProjectUtils;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
@@ -108,13 +110,23 @@ public class TextRulerToolkit {
return FileLocator.find(TextRulerPlugin.getDefault().getBundle(), new Path(name), null);
}
+ public static AnalysisEngineDescription getAnalysisEngineDescription(String descFile) {
+ AnalysisEngineDescription result = null;
+ try {
+ XMLInputSource in = new XMLInputSource(descFile);
+ result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in);
+ } catch (Exception e) {
+ TextRulerPlugin.error(e);
+ result = null;
+ }
+ return result;
+ }
- public static AnalysisEngine loadAnalysisEngine(String descFile) {
- AnalysisEngine result = null;
+ public static AnalysisEngineDescription getAnalysisEngineDescription(URL fileURL) {
+ AnalysisEngineDescription result = null;
try {
- XMLInputSource in = new XMLInputSource(new File(descFile));
- ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);
- result = UIMAFramework.produceAnalysisEngine(specifier);
+ XMLInputSource in = new XMLInputSource(fileURL);
+ result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in);
} catch (Exception e) {
TextRulerPlugin.error(e);
result = null;
@@ -122,12 +134,10 @@ public class TextRulerToolkit {
return result;
}
- public static AnalysisEngine loadAnalysisEngine(URL fileURL) {
+ public static AnalysisEngine loadAnalysisEngine(AnalysisEngineDescription desc) {
AnalysisEngine result = null;
try {
- XMLInputSource in = new XMLInputSource(fileURL);
- ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);
- result = UIMAFramework.produceAnalysisEngine(specifier);
+ result = UIMAFramework.produceAnalysisEngine(desc);
} catch (Exception e) {
TextRulerPlugin.error(e);
result = null;
@@ -135,6 +145,21 @@ public class TextRulerToolkit {
return result;
}
+ public static void addBoundaryTypes(AnalysisEngineDescription description, String[] slotNames) {
+ List<String> list = new ArrayList<String>();
+ for (String eachSlot : slotNames) {
+ list.add(eachSlot + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
+ list.add(eachSlot + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+ }
+ TypeSystemDescription typeSystem = description.getAnalysisEngineMetaData().getTypeSystem();
+ for (String string : list) {
+ TypeDescription type = typeSystem.getType(string);
+ if (type == null) {
+ typeSystem.addType(string, "", "uima.tcas.Annotation");
+ }
+ }
+ }
+
public static CAS readCASfromXMIFile(String filename, AnalysisEngine ae, CAS reuseCAS) {
return readCASfromXMIFile(new File(filename), ae, reuseCAS);
}
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java Thu Jun 6 14:49:04 2013
@@ -19,6 +19,7 @@
package org.apache.uima.ruta.textruler.core;
+import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
@@ -51,7 +52,8 @@ public class TextRulerWordConstraint {
super();
this.tokenAnnotation = tokenAnnotation;
if (AUTO_REGEXP) {
- TypeSystem ts = tokenAnnotation.getDocument().getCAS().getTypeSystem();
+ CAS cas = tokenAnnotation.getDocument().getCAS();
+ TypeSystem ts = cas.getTypeSystem();
Type wType = ts.getType(TextRulerToolkit.RUTA_WORD_TYPE_NAME);
Type numType = ts.getType(TextRulerToolkit.RUTA_NUM_TYPE_NAME);
Type markupType = ts.getType(TextRulerToolkit.RUTA_MARKUP_TYPE_NAME);
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java Thu Jun 6 14:49:04 2013
@@ -58,7 +58,7 @@ public class TextRulerController {
protected static boolean shouldAbort = false;
protected static String currentPreprocessorTMFile = null;
-
+
protected static boolean skip;
protected static Map<String, Map<String, Object>> currentAlgorithmParams;
@@ -159,7 +159,7 @@ public class TextRulerController {
currentDelegate = delegate;
currentAlgorithmParams = algParams;
skip = skipPreprocessing;
-
+
saveParametersToTempFolder(inFolder, skipPreprocessing);
for (TextRulerLearnerController c : learners)
c.resetStatusString();
@@ -172,7 +172,7 @@ public class TextRulerController {
public void run() {
TextRulerPreprocessor p = new TextRulerPreprocessor();
String algorithmsInputFolder = p.run(inFolder, currentPreprocessorTMFile, currentTempDir,
- new TextRulerPreprocessorDelegate() {
+ currentSlotNames, new TextRulerPreprocessorDelegate() {
public void preprocessorStatusUpdate(TextRulerPreprocessor p,
String statusString) {
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java Thu Jun 6 14:49:04 2013
@@ -23,6 +23,7 @@ import java.io.File;
import java.io.FilenameFilter;
import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.GlobalCASSource;
@@ -37,10 +38,13 @@ import org.eclipse.core.runtime.Path;
*/
public class TextRulerPreprocessor {
- public String run(String inFolder, String tmFile, String tmpDir,
+ public String run(String inFolder, String tmFile, String tmpDir, String[] currentSlotNames,
TextRulerPreprocessorDelegate delegate) {
- AnalysisEngine ae = TextRulerToolkit.loadAnalysisEngine(TextRulerToolkit
+ AnalysisEngineDescription analysisEngineDescription = TextRulerToolkit.getAnalysisEngineDescription(TextRulerToolkit
.getEngineDescriptorFromTMSourceFile(new Path(tmFile)));
+ // we want to reuse these cases, so extend the type system in case a boundary-based learner is called
+ TextRulerToolkit.addBoundaryTypes(analysisEngineDescription, currentSlotNames);
+ AnalysisEngine ae = TextRulerToolkit.loadAnalysisEngine(analysisEngineDescription);
// preprocess input XMIs
File inputFolder = new File(inFolder);
Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java Thu Jun 6 14:49:04 2013
@@ -111,6 +111,7 @@ public abstract class BasicLP2 extends T
public BasicLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate);
+ supportBoundaries = true;
}
protected TextRulerRuleList learnTaggingRules(TextRulerTarget target,