You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2013/06/06 16:49:05 UTC

svn commit: r1490307 - in /uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler: core/ extension/ learner/lp2/ tools/

Author: pkluegl
Date: Thu Jun  6 14:49:04 2013
New Revision: 1490307

URL: http://svn.apache.org/r1490307
Log:
UIMA-2344
- support boundaries by expanding type systems

Removed:
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/BatchRuleEvaluator.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/BatchRuleScorer.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/F1Scorer.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/tools/SGMLToXMIConverter.java
Modified:
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java
    uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerAnnotation.java Thu Jun  6 14:49:04 2013
@@ -74,7 +74,7 @@ public class TextRulerAnnotation {
   }
 
   public TextRulerAnnotation(AnnotationFS afs, TextRulerExampleDocument document) {
-    this(afs, null, null);
+    this(afs, document, null);
   }
 
   public TextRulerAnnotation(AnnotationFS afs) {

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerBasicLearner.java Thu Jun  6 14:49:04 2013
@@ -30,10 +30,17 @@ import java.util.Set;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASRuntimeException;
+import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.TypeDescription;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.resource.metadata.impl.TypeDescription_impl;
 import org.apache.uima.ruta.engine.RutaEngine;
 import org.apache.uima.ruta.ide.core.builder.RutaProjectUtils;
 import org.apache.uima.ruta.textruler.TextRulerPlugin;
@@ -86,6 +93,8 @@ public abstract class TextRulerBasicLear
 
   private boolean configChanged = false;
 
+  protected boolean supportBoundaries = false;
+
   public TextRulerBasicLearner(String inputDir, String prePropTMFile, String tmpDir,
           String[] slotNames, Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
     super();
@@ -112,9 +121,9 @@ public abstract class TextRulerBasicLear
     useDefaultFiltering &= filterSet.contains("org.apache.uima.ruta.type.BREAK");
     useDefaultFiltering &= filterSet.contains("org.apache.uima.ruta.type.NBSP");
     useDefaultFiltering &= filterSet.contains("org.apache.uima.ruta.type.MARKUP");
-    
+
     configChanged = true;
-    
+
     this.casCache = new CasCache(100, this); // TODO make size configurable
     // !? share e.g. 100 places for
     // all running algoritghms ?
@@ -142,7 +151,13 @@ public abstract class TextRulerBasicLear
     String descriptorFile = TextRulerToolkit.getEngineDescriptorFromTMSourceFile(new Path(
             preprocessorTMFile));
     sendStatusUpdateToDelegate("loading AE...", TextRulerLearnerState.ML_INITIALIZING, false);
-    ae = TextRulerToolkit.loadAnalysisEngine(descriptorFile);
+    
+    AnalysisEngineDescription description = TextRulerToolkit.getAnalysisEngineDescription(descriptorFile);
+    if(supportBoundaries) {
+      TextRulerToolkit.addBoundaryTypes(description, slotNames);
+    }
+    
+    ae = TextRulerToolkit.loadAnalysisEngine(description);
 
     // set filters to NO filtering so that we can add it manually with
     // the FILTERTYPE expression!
@@ -154,17 +169,19 @@ public abstract class TextRulerBasicLear
     ae.setConfigParameterValue(RutaEngine.ADDITIONAL_SCRIPTS, new String[0]);
     ae.setConfigParameterValue(RutaEngine.RELOAD_SCRIPT, true);
     ae.setConfigParameterValue(RutaEngine.REMOVE_BASICS, true);
-    if(useDynamicAnchoring) {
+    if (useDynamicAnchoring) {
       ae.setConfigParameterValue(RutaEngine.DYNAMIC_ANCHORING, true);
     }
-
     try {
       ae.reconfigure();
     } catch (ResourceConfigurationException e) {
       TextRulerPlugin.error(e);
     }
+    configChanged = true;
   }
 
+ 
+
   protected boolean checkForMandatoryTypes() {
     // check if all passed slot types are present:
     CAS someCas = getTestCAS();
@@ -442,11 +459,36 @@ public abstract class TextRulerBasicLear
   }
 
   public String getFileHeaderString(boolean complete) {
-    return getPackageString() + getTypeSystemImport(complete) + getFilterCommandString() + getUseDynamicAnchoring(complete);
+    return getPackageString() + getTypeSystemImport(complete) + getFilterCommandString()
+            + getUseDynamicAnchoring(complete) + getBoundaryDeclarations(complete);
+  }
+
+  private String getBoundaryDeclarations(boolean complete) {
+    if (complete && supportBoundaries && slotNames.length > 0) {
+      StringBuilder sb = new StringBuilder();
+      sb.append("DECLARE ");
+      int count = 0;
+      for (String slot : slotNames) {
+        String[] split = slot.split("[.]");
+        String shortName = split[split.length - 1];
+        sb.append(shortName);
+        sb.append("START");
+        sb.append(", ");
+        sb.append(shortName);
+        sb.append("END");
+        if (count < slotNames.length - 1) {
+          sb.append(", ");
+        }
+        count++;
+      }
+      sb.append(";\n");
+      return sb.toString();
+    }
+    return "";
   }
 
   private String getUseDynamicAnchoring(boolean complete) {
-    if(useDynamicAnchoring && complete) {
+    if (useDynamicAnchoring && complete) {
       return "Document{-> DYNAMICANCHORING(true)};\n";
     } else {
       return "";
@@ -454,20 +496,21 @@ public abstract class TextRulerBasicLear
   }
 
   private String getTypeSystemImport(boolean complete) {
-    if(complete) {
-    IPath path = Path.fromOSString(preprocessorTMFile);
-    IPath removeLastSegments = path.removeLastSegments(1);
-    IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot().getContainerForLocation(removeLastSegments);
-    IProject project = containerForLocation.getProject();
-    IPath scriptRootPath = RutaProjectUtils.getScriptRootPath(project);
-    String moduleName = RutaProjectUtils.getModuleName(path);
-    IPath makeRelativeTo = path.makeRelativeTo(scriptRootPath);
-    String m = makeRelativeTo.removeFileExtension().toPortableString().replaceAll("/", ".");
-    String importString = "SCRIPT " + m + ";\n";
-    if (!skip) {
+    if (complete) {
+      IPath path = Path.fromOSString(preprocessorTMFile);
+      IPath removeLastSegments = path.removeLastSegments(1);
+      IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot()
+              .getContainerForLocation(removeLastSegments);
+      IProject project = containerForLocation.getProject();
+      IPath scriptRootPath = RutaProjectUtils.getScriptRootPath(project);
+      String moduleName = RutaProjectUtils.getModuleName(path);
+      IPath makeRelativeTo = path.makeRelativeTo(scriptRootPath);
+      String m = makeRelativeTo.removeFileExtension().toPortableString().replaceAll("/", ".");
+      String importString = "SCRIPT " + m + ";\n";
+      if (!skip) {
         importString += "Document{-> CALL(" + moduleName + ")};\n";
-    }
-    return importString;
+      }
+      return importString;
     }
     return "";
   }
@@ -475,8 +518,10 @@ public abstract class TextRulerBasicLear
   public String getPackageString() {
     IPath path = Path.fromOSString(preprocessorTMFile);
     IPath removeLastSegments = path.removeLastSegments(1);
-    IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot().getContainerForLocation(removeLastSegments);
-    IPath removeFirstSegments = containerForLocation.getProjectRelativePath().removeFirstSegments(1);
+    IContainer containerForLocation = ResourcesPlugin.getWorkspace().getRoot()
+            .getContainerForLocation(removeLastSegments);
+    IPath removeFirstSegments = containerForLocation.getProjectRelativePath()
+            .removeFirstSegments(1);
     String replaceAll = removeFirstSegments.toPortableString().replaceAll("/", ".");
     return "PACKAGE " + replaceAll + ";\n\n";
   }
@@ -509,8 +554,8 @@ public abstract class TextRulerBasicLear
     // them works without leaking, so we prefer this now since it also
     // brought a performance
     // boost!
-    
-    if(configChanged && algTestCAS != null) { // type system maybe changed
+
+    if (configChanged && algTestCAS != null) { // type system maybe changed
       GlobalCASSource.releaseCAS(algTestCAS);
       algTestCAS = null;
     }

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerToolkit.java Thu Jun  6 14:49:04 2013
@@ -37,6 +37,7 @@ import java.util.Set;
 
 import org.apache.uima.UIMAFramework;
 import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.Feature;
@@ -45,7 +46,8 @@ import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.cas.impl.XmiCasDeserializer;
 import org.apache.uima.cas.impl.XmiCasSerializer;
 import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.resource.metadata.TypeDescription;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
 import org.apache.uima.ruta.engine.RutaEngine;
 import org.apache.uima.ruta.ide.core.builder.RutaProjectUtils;
 import org.apache.uima.ruta.textruler.TextRulerPlugin;
@@ -108,13 +110,23 @@ public class TextRulerToolkit {
     return FileLocator.find(TextRulerPlugin.getDefault().getBundle(), new Path(name), null);
   }
 
+  public static AnalysisEngineDescription getAnalysisEngineDescription(String descFile) {
+    AnalysisEngineDescription result = null;
+    try {
+      XMLInputSource in = new XMLInputSource(descFile);
+      result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in);
+    } catch (Exception e) {
+      TextRulerPlugin.error(e);
+      result = null;
+    }
+    return result;
+  }
 
-  public static AnalysisEngine loadAnalysisEngine(String descFile) {
-    AnalysisEngine result = null;
+  public static AnalysisEngineDescription getAnalysisEngineDescription(URL fileURL) {
+    AnalysisEngineDescription result = null;
     try {
-      XMLInputSource in = new XMLInputSource(new File(descFile));
-      ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);
-      result = UIMAFramework.produceAnalysisEngine(specifier);
+      XMLInputSource in = new XMLInputSource(fileURL);
+      result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in);
     } catch (Exception e) {
       TextRulerPlugin.error(e);
       result = null;
@@ -122,12 +134,10 @@ public class TextRulerToolkit {
     return result;
   }
 
-  public static AnalysisEngine loadAnalysisEngine(URL fileURL) {
+  public static AnalysisEngine loadAnalysisEngine(AnalysisEngineDescription desc) {
     AnalysisEngine result = null;
     try {
-      XMLInputSource in = new XMLInputSource(fileURL);
-      ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);
-      result = UIMAFramework.produceAnalysisEngine(specifier);
+      result = UIMAFramework.produceAnalysisEngine(desc);
     } catch (Exception e) {
       TextRulerPlugin.error(e);
       result = null;
@@ -135,6 +145,21 @@ public class TextRulerToolkit {
     return result;
   }
 
+  public static void addBoundaryTypes(AnalysisEngineDescription description, String[] slotNames) {
+    List<String> list = new ArrayList<String>();
+    for (String eachSlot : slotNames) {
+      list.add(eachSlot + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
+      list.add(eachSlot + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
+    }
+    TypeSystemDescription typeSystem = description.getAnalysisEngineMetaData().getTypeSystem();
+    for (String string : list) {
+      TypeDescription type = typeSystem.getType(string);
+      if (type == null) {
+        typeSystem.addType(string, "", "uima.tcas.Annotation");
+      }
+    }
+  }
+
   public static CAS readCASfromXMIFile(String filename, AnalysisEngine ae, CAS reuseCAS) {
     return readCASfromXMIFile(new File(filename), ae, reuseCAS);
   }

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/core/TextRulerWordConstraint.java Thu Jun  6 14:49:04 2013
@@ -19,6 +19,7 @@
 
 package org.apache.uima.ruta.textruler.core;
 
+import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 
@@ -51,7 +52,8 @@ public class TextRulerWordConstraint {
     super();
     this.tokenAnnotation = tokenAnnotation;
     if (AUTO_REGEXP) {
-      TypeSystem ts = tokenAnnotation.getDocument().getCAS().getTypeSystem();
+      CAS cas = tokenAnnotation.getDocument().getCAS();
+      TypeSystem ts = cas.getTypeSystem();
       Type wType = ts.getType(TextRulerToolkit.RUTA_WORD_TYPE_NAME);
       Type numType = ts.getType(TextRulerToolkit.RUTA_NUM_TYPE_NAME);
       Type markupType = ts.getType(TextRulerToolkit.RUTA_MARKUP_TYPE_NAME);

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerController.java Thu Jun  6 14:49:04 2013
@@ -58,7 +58,7 @@ public class TextRulerController {
   protected static boolean shouldAbort = false;
 
   protected static String currentPreprocessorTMFile = null;
-  
+
   protected static boolean skip;
 
   protected static Map<String, Map<String, Object>> currentAlgorithmParams;
@@ -159,7 +159,7 @@ public class TextRulerController {
     currentDelegate = delegate;
     currentAlgorithmParams = algParams;
     skip = skipPreprocessing;
-    
+
     saveParametersToTempFolder(inFolder, skipPreprocessing);
     for (TextRulerLearnerController c : learners)
       c.resetStatusString();
@@ -172,7 +172,7 @@ public class TextRulerController {
         public void run() {
           TextRulerPreprocessor p = new TextRulerPreprocessor();
           String algorithmsInputFolder = p.run(inFolder, currentPreprocessorTMFile, currentTempDir,
-                  new TextRulerPreprocessorDelegate() {
+                  currentSlotNames, new TextRulerPreprocessorDelegate() {
 
                     public void preprocessorStatusUpdate(TextRulerPreprocessor p,
                             String statusString) {

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/extension/TextRulerPreprocessor.java Thu Jun  6 14:49:04 2013
@@ -23,6 +23,7 @@ import java.io.File;
 import java.io.FilenameFilter;
 
 import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.ruta.textruler.TextRulerPlugin;
 import org.apache.uima.ruta.textruler.core.GlobalCASSource;
@@ -37,10 +38,13 @@ import org.eclipse.core.runtime.Path;
  */
 public class TextRulerPreprocessor {
 
-  public String run(String inFolder, String tmFile, String tmpDir,
+  public String run(String inFolder, String tmFile, String tmpDir, String[] currentSlotNames,
           TextRulerPreprocessorDelegate delegate) {
-    AnalysisEngine ae = TextRulerToolkit.loadAnalysisEngine(TextRulerToolkit
+    AnalysisEngineDescription analysisEngineDescription = TextRulerToolkit.getAnalysisEngineDescription(TextRulerToolkit
             .getEngineDescriptorFromTMSourceFile(new Path(tmFile)));
+    // we want to reuse these CASes, so extend the type system in case a boundary-based learner is called
+    TextRulerToolkit.addBoundaryTypes(analysisEngineDescription, currentSlotNames);
+    AnalysisEngine ae = TextRulerToolkit.loadAnalysisEngine(analysisEngineDescription);
 
     // preprocess input XMIs
     File inputFolder = new File(inFolder);

Modified: uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java
URL: http://svn.apache.org/viewvc/uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java?rev=1490307&r1=1490306&r2=1490307&view=diff
==============================================================================
--- uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java (original)
+++ uima/sandbox/ruta/trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/lp2/BasicLP2.java Thu Jun  6 14:49:04 2013
@@ -111,6 +111,7 @@ public abstract class BasicLP2 extends T
   public BasicLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
           Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
     super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate);
+    supportBoundaries = true;
   }
 
   protected TextRulerRuleList learnTaggingRules(TextRulerTarget target,